# Base name of the generated binaries.  A device suffix and the output
# directory are attached to it further down in this file.
NIUTRANS_EXE := NiuTensor

# Project layout: sources are read from SRC, the shared library is written
# to LIB_DIR and the executable to EXE_DIR.
ROOT    := .
SRC     := $(ROOT)/source
LIB_DIR := $(ROOT)/lib
EXE_DIR := $(ROOT)/bin

# Set to 1 to link the objects into a shared library as well; any other
# value skips that step.
dll := 0

# Target platform:
#   0 - Windows or Linux
#   1 - Macintosh
OnMac := 0

# Compute device:
#   0 - CPU
#   1 - GPU (CUDA)
USE_CUDA := 0

# Location of the CUDA toolkit; modify this path if necessary.
CUDA_ROOT    := /usr/local/cuda-9.0
CUDA_LIB_DIR := $(CUDA_ROOT)/lib64
CUDA_INCLUDE := $(CUDA_ROOT)/include

# Optional BLAS backend: Intel MKL (set USE_MKL to 1 to enable it).
USE_MKL     := 0
INTEL_ROOT  := /opt/intel
MKL_ROOT    := /opt/intel/mkl
MKL_LIB_DIR := $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE := $(MKL_ROOT)/include

# Optional BLAS backend: OpenBLAS (set USE_OPENBLAS to 1 to enable it).
USE_OPENBLAS     := 0
OPENBLAS_ROOT    := /opt/OpenBLAS
OPENBLAS_LIB_DIR := $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE := $(OPENBLAS_ROOT)/include

# Every directory below the source tree, collected once while the makefile
# is parsed.  Simple (:=) assignment matters here: this list is expanded
# again through INC_DIR and CFLAGS for each compile and link command, and a
# recursive (=) definition would fork `find` on every one of those expansions.
SRC_DIR := $(shell find $(SRC) -type d)

# Header search directories (INC_DIR), library search directories
# (DEPLIB_DIR) and preprocessor defines (MACRO).  Every source directory is
# an include directory; each enabled backend then contributes its own
# include path, library path and defines.
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
MACRO =

# CUDA
ifeq ($(USE_CUDA), 1)
  INC_DIR    += $(CUDA_INCLUDE)
  DEPLIB_DIR += $(CUDA_LIB_DIR)
  MACRO      += -DUSE_CUDA
endif

# Intel MKL
ifeq ($(USE_MKL), 1)
  INC_DIR    += $(MKL_INCLUDE)
  DEPLIB_DIR += $(MKL_LIB_DIR)
  MACRO      += -DUSE_BLAS -DMKL
endif

# OpenBLAS
ifeq ($(USE_OPENBLAS), 1)
  INC_DIR    += $(OPENBLAS_INCLUDE)
  DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
  MACRO      += -DUSE_BLAS -DOPENBLAS
endif

# Libraries handed to the linker.
#   STATIC_DEPLIB  - archives referenced by their full path
#   DYNAMIC_DEPLIB - shared libraries referenced with -l
# Each enabled backend appends to both lists.
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread

ifeq ($(USE_MKL), 1)
    STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
                     $(MKL_LIB_DIR)/libmkl_core.a \
                     $(MKL_LIB_DIR)/libmkl_intel_thread.a \
                     $(INTEL_ROOT)/lib/intel64/libiomp5.a
    DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif

ifeq ($(USE_OPENBLAS), 1)
    STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
    DYNAMIC_DEPLIB += -lopenblas
endif

# libdl is deliberately not listed here by absolute path: /lib64/libdl.so.2
# does not exist on every distribution, and DEPLIBS already links it
# portably with -ldl.
ifeq ($(USE_CUDA), 1)
    STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
                     $(CUDA_LIB_DIR)/libculibos.a \
                     $(CUDA_LIB_DIR)/libnpps_static.a \
                     $(CUDA_LIB_DIR)/libnppc_static.a \
                     $(CUDA_LIB_DIR)/libcudadevrt.a \
                     $(CUDA_LIB_DIR)/libcurand_static.a
    DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif

# Complete library list for the link commands.  Except on Mac builds, the
# static archives are wrapped in --start-group/--end-group.
ifeq ($(OnMac), 1)
    STATIC_LINK_GROUP = $(STATIC_DEPLIB)
else
    STATIC_LINK_GROUP = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group
endif
DEPLIBS = $(STATIC_LINK_GROUP) -lm -ldl $(DYNAMIC_DEPLIB)

# Compilers.  gcc/g++ are the defaults; when USE_INTEL_COMPILER is 1 both
# the C and the C++ compiler are set to icc instead.  nvcc is taken from
# the CUDA toolkit directory.
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
    CC = icc
    CXX = icc
else
    CC = gcc
    CXX = g++
endif

# Program entry point.  It is excluded from the object list and compiled
# directly by the command that links the executable.
MAIN_FILE = $(SRC)/Main.cpp

# Tag the output name with the device the build targets.
ifeq ($(USE_CUDA), 1)
    DEVICE_TAG := GPU
else
    DEVICE_TAG := CPU
endif
NIUTRANS_EXE := $(NIUTRANS_EXE).$(DEVICE_TAG)

# Derive both artifact paths from the tagged name.  The library path has to
# be computed first because the last assignment rewrites NIUTRANS_EXE.
NIUTRANS_DLL := $(LIB_DIR)/lib$(NIUTRANS_EXE).so
NIUTRANS_EXE := $(EXE_DIR)/$(NIUTRANS_EXE)

# Flags for the C and C++ compilers: language level and instruction set,
# warning suppression, optimisation, then position-independent code and one
# -I option per include directory.
CFLAGS = -std=c++11 -msse4.2 -w -march=native \
         -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format \
         -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions \
         -fPIC $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)

# Flags for nvcc.  The include directories are not added to this list.
# Architecture examples:
#   GTX 1080 - arch=compute_61,code=sm_61
#   K80      - arch=compute_37,code=sm_37
# If the architecture is set wrong, the result can be `-inf`.
CUDA_FLAG = -arch=sm_30 \
            -gencode=arch=compute_30,code=sm_30 \
            -gencode=arch=compute_50,code=sm_50 \
            -gencode=arch=compute_52,code=sm_52 \
            -gencode=arch=compute_60,code=sm_60 \
            -gencode=arch=compute_61,code=sm_61 \
            -gencode=arch=compute_62,code=sm_62 \
            -gencode=arch=compute_70,code=sm_70 \
            -gencode=arch=compute_70,code=compute_70 \
            -maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math -std=c++11

# Linker search path: one -L option per dependency library directory.
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))

# Source files to compile: every file with one of the extensions in SRC_EXTS
# found in any source directory.  CUDA sources are only picked up for GPU
# builds.
SRC_EXTS := c cpp cc
ifeq ($(USE_CUDA), 1)
SRC_EXTS += cu
endif

SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(addprefix $(dir)/*.,$(SRC_EXTS))))

# The entry point is compiled by the executable's link command, so it is
# dropped from the list.  filter-out compares whole words, so only the exact
# MAIN_FILE path is removed (subst would rewrite any matching substring).
SOURCES := $(filter-out $(MAIN_FILE),$(SOURCES))

# Object files, one per source file.  Every extension collected into SOURCES
# needs a substitution here: an entry left unconverted stays in OBJS as a
# source path, and `clean` removes everything listed in OBJS.
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
OBJS := $(patsubst %.cc,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif

# Compile *.cc sources exactly like *.cpp sources.  A pattern rule never
# becomes the default goal, so defining it ahead of `all` is safe.
%.o: %.cc
	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@

# Default goal: print the opening banner, build the library and the
# executable, then print the closing banner.  Both targets are commands, not
# files, so they are declared phony; a file with the same name in the
# project root can then never make them look up to date.
.PHONY: all start
all: start lib exe finish

start:
	@echo ""
	@echo "Start building ..."

# Library stage.  All wrapper targets are commands rather than files and are
# declared phony; this matters in particular for `lib`, because with the
# default LIB_DIR a directory of that very name exists after the first build.
.PHONY: lib start_lib niutrans_dll finish_lib
lib: start_lib niutrans_dll finish_lib

start_lib:
	@mkdir -p $(LIB_DIR)
	@echo ""
	@echo "Start building library"

niutrans_dll: $(NIUTRANS_DLL)

# Link all objects into the shared library when dll is 1; otherwise only
# report that the step is skipped.  The output directory is created by this
# recipe itself, so the link does not depend on start_lib having run first
# (parallel builds, or requesting the library file directly).
$(NIUTRANS_DLL): $(OBJS)
ifeq ($(dll), 1)
	@echo "Building dynamic link library: $(NIUTRANS_DLL)"
	@mkdir -p $(@D)
	@$(CXX) -shared -Wall $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
else
	@echo "Skip building dynamic link library"
endif

finish_lib:
	@echo "Finish building library"
	@echo ""

# Executable stage.  The wrapper targets are commands rather than files, so
# they are declared phony.
.PHONY: exe start_exe niutrans_exe finish_exe
exe: start_exe niutrans_exe finish_exe

start_exe:
	@mkdir -p $(EXE_DIR)
	@echo ""
	@echo "Start building executable file"

niutrans_exe: $(NIUTRANS_EXE)

# Compile the entry point and link it with all objects.  The output
# directory is created by this recipe itself, so the link does not depend on
# start_exe having run first (parallel builds, or requesting the executable
# file directly).
$(NIUTRANS_EXE): $(OBJS) $(MAIN_FILE)
	@echo "Building executable file: $(NIUTRANS_EXE)"
	@mkdir -p $(@D)
	@$(CXX) $(MAIN_FILE) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@

finish_exe:
	@echo "Finish building executable file"
	@echo ""

# Closing banner of the build.  Declared phony because it is a command, not
# a file; a file named `finish` would otherwise suppress the message.
.PHONY: finish
finish:
	@echo "Finish building ..."
	@echo ""

# C sources -> object files.
%.o: %.c
	@$(CC) $(CFLAGS) -c $< -o $@

# C++ sources -> object files; the backend defines in MACRO are passed too.
%.o: %.cpp
	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@

# Extra nvcc options used only when the shared library is requested.
ifeq ($(dll), 1)
NVCC_DLL_FLAGS = --shared --compiler-options '-fPIC'
else
NVCC_DLL_FLAGS =
endif

# CUDA sources -> object files.
%.cuo: %.cu
	@$(NVCC) $(NVCC_DLL_FLAGS) $(CUDA_FLAG) -c $< -o $@

# Remove the object files produced by the build.  Only entries of OBJS that
# really are objects (.o / .cuo) are handed to rm, so any other path that
# ends up in that list is never deleted.  Errors from rm are ignored.
.PHONY: clean
clean:
	@echo "Cleaning object files"
	@-rm -f $(filter %.o %.cuo,$(OBJS))