# Minimum CMake version needed to configure this project
cmake_minimum_required(VERSION 2.8)

# Declare the project
project(NiuTensor)

# ---------------------------------------------------------------------------
# User-configurable switches (override on the command line with -D<NAME>=ON)
# ---------------------------------------------------------------------------
# GPU support, and half-precision support inside the CUDA code
option(USE_CUDA "Use CUDA" OFF)
option(USE_HALF_PRECISION "Use Half Precision in CUDA Codes" OFF)
# BLAS back ends (MKL takes precedence over OpenBLAS when linking)
option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
# Build a dynamic link library instead of an executable
option(GEN_DLL "Generate Dynamic Link Library" OFF)

# ---------------------------------------------------------------------------
# Artifact names and locations
# ---------------------------------------------------------------------------
# The executable and the dynamic link library share the same base name
set(NIUTENSOR_EXE "NiuTensor")
set(NIUTENSOR_DLL "${NIUTENSOR_EXE}")

# Where generated executables and libraries are placed.
# NOTE(review): these are relative paths; presumably they are meant to be
# resolved against a build directory located directly under the project
# root - confirm against the documented build steps.
set(EXECUTABLE_OUTPUT_PATH ../bin)
set(LIBRARY_OUTPUT_PATH ../lib)

# ---------------------------------------------------------------------------
# Platform / IDE behaviour
# ---------------------------------------------------------------------------
# Use CMAKE_MACOSX_RPATH for macOS
set(CMAKE_MACOSX_RPATH 1)

# Export every symbol of shared libraries built on Windows
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

# Enable folder management in IDE generators
set_property(GLOBAL PROPERTY USE_FOLDERS ON)

# Root directories of the optional third-party dependencies.
#
# Each root can be supplied on the command line, e.g.
#   cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT=/usr/local/cuda ..
# When a root is not supplied, the platform default below is used; edit the
# defaults if your installation lives elsewhere:
#   USE_CUDA     -> CUDA_TOOLKIT_ROOT
#   USE_MKL      -> INTEL_ROOT
#   USE_OPENBLAS -> OPENBLAS_ROOT
if(USE_CUDA)
    # The expansion is quoted so that an unset/empty CUDA_TOOLKIT_ROOT is
    # evaluated as "path does not exist" instead of relying on how CMake
    # happens to parse a dangling `if(NOT EXISTS)`.
    if(NOT EXISTS "${CUDA_TOOLKIT_ROOT}")
        if(CUDA_TOOLKIT_ROOT)
            # A path was given but is not present on disk: say so rather than
            # silently replacing it with the default.
            message(WARNING "CUDA_TOOLKIT_ROOT '${CUDA_TOOLKIT_ROOT}' does not exist; falling back to the default location")
        endif()
        if(WIN32)
            set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
        else()
            set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
        endif()
    endif()
    # CUDA_TOOLKIT_ROOT_DIR is the search hint read by the FindCUDA module
    set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_TOOLKIT_ROOT}")
    message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
endif()

if(USE_MKL)
    if(NOT DEFINED INTEL_ROOT)
        if(WIN32)
            set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
        else()
            set(INTEL_ROOT "/opt/intel/compilers_and_libraries_2020.2.254/linux")
        endif()
    endif()
    message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
endif()

if(USE_OPENBLAS)
    if(NOT DEFINED OPENBLAS_ROOT)
        if(WIN32)
            set(OPENBLAS_ROOT "C:/Program Files/OpenBLAS")
        else()
            set(OPENBLAS_ROOT "/opt/OpenBLAS")
        endif()
    endif()
    message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
endif()

# Recursively collect every .cpp, .h, .cu and .cuh file below the source
# folder. The patterns are rooted at the current source directory explicitly.
file(GLOB_RECURSE CPP_FILES "${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp")
file(GLOB_RECURSE H_FILES   "${CMAKE_CURRENT_SOURCE_DIR}/source/*.h")
file(GLOB_RECURSE CU_FILES  "${CMAKE_CURRENT_SOURCE_DIR}/source/*.cu")
file(GLOB_RECURSE CUH_FILES "${CMAKE_CURRENT_SOURCE_DIR}/source/*.cuh")

# assign_source_group(<file>...)
#
# Puts every given file into an IDE source group named after the directory
# that contains it, so the IDE tree mirrors the on-disk layout.
# Absolute paths are first made relative to CMAKE_CURRENT_SOURCE_DIR;
# relative paths are used as given.
function(assign_source_group)
    foreach(src_file IN ITEMS ${ARGN})
        # Start from the path as given and relativize it only when needed
        set(rel_file "${src_file}")
        if(IS_ABSOLUTE "${src_file}")
            file(RELATIVE_PATH rel_file "${CMAKE_CURRENT_SOURCE_DIR}" "${src_file}")
        endif()

        # The group name is the directory part, with "/" turned into "\"
        # because source_group() uses backslashes to separate nested groups.
        get_filename_component(group_dir "${rel_file}" PATH)
        string(REPLACE "/" "\\" group_name "${group_dir}")

        source_group("${group_name}" FILES "${src_file}")
    endforeach()
endfunction()

# my_add_executable(<target> <source>...)
#
# Creates the executable <target> from the given sources and places each
# source in an IDE source group via assign_source_group(). When USE_CUDA is
# enabled the target is created with cuda_add_executable() (FindCUDA must
# already have been loaded by the time this is called); otherwise with
# add_executable(). All arguments are forwarded unchanged to that command.
function(my_add_executable)
    # The function declares no named parameters, so ARGN equals ARGV and its
    # first element is the target name. Drop it so that only real source
    # files are handed to source_group().
    set(_sources ${ARGN})
    if(ARGC GREATER 0)
        list(REMOVE_AT _sources 0)
    endif()
    assign_source_group(${_sources})

    if(USE_CUDA)
        cuda_add_executable(${ARGV})
    else()
        add_executable(${ARGV})
    endif()
endfunction()

# Set libs and compiler options for CUDA.
#
# Reads:  USE_HALF_PRECISION, CUDA_TOOLKIT_ROOT and the optional GPU_ARCH
#         (one of K, M, P, V, T, A - see the mapping below).
# Writes: CUDA_NVCC_FLAGS, CMAKE_CXX_FLAGS, CUDA_LIB_PATH (full paths of the
#         CUDA libraries to link) and directory-scope include/link
#         directories and definitions.
if(USE_CUDA)
    add_definitions(-DUSE_CUDA)
    if(USE_HALF_PRECISION)
        add_definitions(-DHALF_PRECISION)
    endif()
    find_package(CUDA REQUIRED)

    # Map the one-letter GPU generation to nvcc architecture flags.
    # When GPU_ARCH is not one of these letters ARCH_FLAGS is left untouched.
    if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
        set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
    elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
        set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
    elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
        set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
    elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
        set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
    elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
        set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
    elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
        set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
    endif()

    # Half precision: without an explicit GPU_ARCH, build for Pascal and
    # Volta targets; Kepler and Maxwell are rejected.
    if(USE_HALF_PRECISION)
        if(NOT DEFINED GPU_ARCH)
            set(ARCH_FLAGS -arch=sm_60
                -gencode=arch=compute_60,code=sm_60
                -gencode=arch=compute_61,code=sm_61
                -gencode=arch=compute_62,code=sm_62
                -gencode=arch=compute_70,code=sm_70
                -gencode=arch=compute_72,code=sm_72
                -gencode=arch=compute_70,code=compute_70
            )
        # GPU_ARCH is referenced by name (not as ${GPU_ARCH}) so that an
        # empty value cannot produce a malformed condition and the value is
        # dereferenced exactly once.
        elseif(GPU_ARCH STREQUAL K OR GPU_ARCH STREQUAL M)
            message(FATAL_ERROR "your GPU cannot use the function half precision")
        endif()
    endif()

    if(WIN32)
        # /wd4819 silences the MSVC code-page warning
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
        # Force 64-bit device code generation
        string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
        link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
        include_directories("${CUDA_TOOLKIT_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")

        list(APPEND CUDA_LIB_PATH "${CUDA_LIB_DIR}cublas.lib")
        # cublasLt is linked for CUDA 11 and every later major release
        # (previously only for exactly 11, which left out CUDA 12+).
        if(CUDA_VERSION_MAJOR GREATER 10)
            list(APPEND CUDA_LIB_PATH "${CUDA_LIB_DIR}cublasLt.lib")
        endif()
        list(APPEND CUDA_LIB_PATH
            "${CUDA_LIB_DIR}npps.lib"
            "${CUDA_LIB_DIR}nppc.lib"
            "${CUDA_LIB_DIR}cudadevrt.lib"
            "${CUDA_LIB_DIR}curand.lib"
        )
    else()
        # NOTE: this replaces (does not extend) any CMAKE_CXX_FLAGS supplied
        # by the user or the environment.
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
        include_directories("${CUDA_TOOLKIT_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")

        # Static archives; the order below is the link order.
        list(APPEND CUDA_LIB_PATH "${CUDA_LIB_DIR}libcublas_static.a")
        # cublasLt is linked for CUDA 11 and every later major release
        # (previously only for exactly 11, which left out CUDA 12+).
        if(CUDA_VERSION_MAJOR GREATER 10)
            list(APPEND CUDA_LIB_PATH "${CUDA_LIB_DIR}libcublasLt_static.a")
        endif()
        list(APPEND CUDA_LIB_PATH
            "${CUDA_LIB_DIR}libculibos.a"
            "${CUDA_LIB_DIR}libnpps_static.a"
            "${CUDA_LIB_DIR}libnppc_static.a"
            "${CUDA_LIB_DIR}libcudadevrt.a"
            "${CUDA_LIB_DIR}libcurand_static.a"
        )

        # Link libdl from the first well-known location that exists
        if(EXISTS "/usr/lib64/libdl.so.2")
            list(APPEND CUDA_LIB_PATH "/usr/lib64/libdl.so.2")
        elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
            list(APPEND CUDA_LIB_PATH "/lib/x86_64-linux-gnu/libdl.so.2")
        elseif(EXISTS "/lib64/libdl.so.2")
            list(APPEND CUDA_LIB_PATH "/lib64/libdl.so.2")
        endif()
    endif()
endif()

# Set libs and compiler options for MKL.
#
# Reads INTEL_ROOT; extends MKL_LIB_PATH with the full paths of the MKL and
# Intel OpenMP libraries to link, and adjusts CMAKE_CXX_FLAGS.
if(USE_MKL)
    add_definitions(-DUSE_BLAS -DMKL)
    set(COMPILER_DIR "${INTEL_ROOT}/compiler")
    set(MKL_DIR "${INTEL_ROOT}/mkl")
    set(CPU_ARCH intel64)

    # Compiler flags: Windows and CUDA builds extend the current flags,
    # a plain (non-CUDA) Linux/macOS build replaces them entirely.
    if(WIN32)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
    elseif(USE_CUDA)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder")
    else()
        set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC")
    endif()

    # Search paths are the same on every platform
    link_directories(${MKL_DIR}/lib/intel64/)
    link_directories(${COMPILER_DIR}/lib/intel64)
    include_directories(${MKL_DIR}/include)
    set(COMPILER_LIB_DIR "${COMPILER_DIR}/lib/intel64/")
    set(MKL_LIB_DIR "${MKL_DIR}/lib/intel64/")

    # Only the library file names differ between platforms
    if(WIN32)
        set(MKL_LIB_PATH ${MKL_LIB_PATH}
            "${MKL_LIB_DIR}mkl_intel_lp64.lib"
            "${MKL_LIB_DIR}mkl_core.lib"
            "${MKL_LIB_DIR}mkl_intel_thread.lib"
            "${COMPILER_LIB_DIR}libiomp5md.lib"
        )
    else()
        set(MKL_LIB_PATH ${MKL_LIB_PATH}
            "${MKL_LIB_DIR}libmkl_intel_lp64.a"
            "${MKL_LIB_DIR}libmkl_core.a"
            "${MKL_LIB_DIR}libmkl_intel_thread.a"
            "${COMPILER_LIB_DIR}libiomp5.a"
        )
    endif()
endif()

# Set libs and compiler options for OpenBLAS.
#
# Reads OPENBLAS_ROOT; extends OPENBLAS_LIB_PATH with the full path of the
# OpenBLAS library to link.
if(USE_OPENBLAS)
    add_definitions(-DUSE_BLAS -DOPENBLAS)
    set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
    set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")

    # Search paths are the same on every platform
    link_directories(${OPENBLAS_LIB_DIR})
    include_directories(${OPENBLAS_INCLUDE_DIR})

    # Only the library file name differs between platforms
    if(NOT WIN32)
        set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.a")
    else()
        set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.lib")
    endif()
endif()

# Integrate all libs: publish each back end's library list under its short
# name, i.e. CUDA_LIB, MKL_LIB and OPENBLAS_LIB are copies of CUDA_LIB_PATH,
# MKL_LIB_PATH and OPENBLAS_LIB_PATH respectively.
foreach(_backend CUDA MKL OPENBLAS)
    set(${_backend}_LIB ${${_backend}_LIB_PATH})
endforeach()

# Create the project's single target: an executable by default, or a shared
# library when GEN_DLL is ON. CUDA sources (.cu/.cuh) are only part of the
# target when USE_CUDA is ON.
set(NIUTENSOR_SOURCES ${CPP_FILES} ${H_FILES})
if(USE_CUDA)
    list(APPEND NIUTENSOR_SOURCES ${CU_FILES} ${CUH_FILES})
endif()

if(NOT GEN_DLL)
    # my_add_executable() picks cuda_add_executable()/add_executable() itself
    my_add_executable(${NIUTENSOR_EXE} ${NIUTENSOR_SOURCES})
elseif(USE_CUDA)
    cuda_add_library(${NIUTENSOR_DLL} SHARED ${NIUTENSOR_SOURCES})
else()
    add_library(${NIUTENSOR_DLL} SHARED ${NIUTENSOR_SOURCES})
endif()

# Link the external libs to the executable or the dynamic link library and
# print a one-line summary of the configuration.
#
# Reads:  CUDA_LIB, MKL_LIB, OPENBLAS_LIB and the USE_* / GEN_DLL options.
# Writes: ALL_LIB (full library paths), FLAG (extra linker flags, non-Windows
#         only) and MESS (the human-readable summary, built as one string so
#         that it prints without list separators).
#
# The keyword-less target_link_libraries() signature is used on purpose:
# FindCUDA's cuda_add_executable()/cuda_add_library() link with the plain
# signature by default, and the two signatures must not be mixed on a target.

# Platform-wide definitions, base linker flags and the summary prefix
if(WIN32)
    add_definitions(-DWIN32)
    set(MESS "${MESS}On Windows")
else()
    # The language standard is passed as a raw compiler flag
    add_definitions(-std=c++11)
    set(MESS "${MESS}On Linux or macOS")
    if(USE_CUDA)
        list(APPEND FLAG "-lpthread -lcudart -lnvidia-ml")
    else()
        list(APPEND FLAG "-lpthread")
    endif()
endif()

# GPU back end
if(USE_CUDA)
    set(MESS "${MESS} Use CUDA")
    list(APPEND ALL_LIB ${CUDA_LIB})
endif()

# BLAS back end: MKL wins when both MKL and OpenBLAS are requested
if(USE_MKL)
    set(MESS "${MESS} Use MKL")
    list(APPEND ALL_LIB ${MKL_LIB})
    if(NOT WIN32)
        list(APPEND FLAG "-liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -ldl")
    endif()
elseif(USE_OPENBLAS)
    set(MESS "${MESS} Use OpenBLAS")
    list(APPEND ALL_LIB ${OPENBLAS_LIB})
    if(NOT WIN32)
        list(APPEND FLAG "-lopenblas")
    endif()
endif()

# Pick the target that was created earlier and report it
if(GEN_DLL)
    set(_niutensor_target ${NIUTENSOR_DLL})
    message(STATUS "Generate Dynamic Link Library")
    message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
else()
    set(_niutensor_target ${NIUTENSOR_EXE})
    message(STATUS "Generate Makefile For Executable File")
    message(STATUS "Name of Executable File: " ${NIUTENSOR_EXE})
endif()

# Extra linker flags are only applied outside Windows
if(WIN32)
    target_link_libraries(${_niutensor_target} ${ALL_LIB})
else()
    target_link_libraries(${_niutensor_target} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
