Commit 2ab2afc9 by xuchen

add a Makefile and correct the build problem

parent 07efb5de
# the prefix of the generated executable files
PREFIX := niutrans
TENSOR := $(PREFIX).tensor
NETWORK := $(PREFIX).network
# code path
SRC = ./source
# use GPU?
USE_CUDA = 1
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda-9.0
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 0
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
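# note: the switches above (USE_CUDA, USE_MKL, USE_OPENBLAS) can be
# overridden from the command line without editing this file, e.g.
#     make USE_CUDA=0 USE_OPENBLAS=1
# (command-line assignments take precedence over assignments made here)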
SRC_DIR = $(shell find $(SRC) -type d)
# header include directories
INC_DIR = $(SRC_DIR)
# external library directories
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependencies
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
                 $(MKL_LIB_DIR)/libmkl_core.a \
                 $(MKL_LIB_DIR)/libmkl_intel_thread.a \
                 $(INTEL_ROOT)/lib/intel64/libiomp5.a
#DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
                 $(CUDA_LIB_DIR)/libculibos.a \
                 $(CUDA_LIB_DIR)/libnpps_static.a \
                 $(CUDA_LIB_DIR)/libnppc_static.a \
                 $(CUDA_LIB_DIR)/libcudadevrt.a \
                 $(CUDA_LIB_DIR)/libcurand_static.a \
                 /lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
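# -Wl,--start-group/--end-group below makes the linker rescan the static
# archives until no new symbols can be resolved, which copes with the
# circular dependencies among the MKL (and CUDA) static libraries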
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = Main.cpp
Tensor_Main := $(SRC)/tensor/$(MAIN_FILE)
Network_Main := $(SRC)/network/$(MAIN_FILE)
TENSOR_CPU := $(TENSOR).cpu
TENSOR_GPU := $(TENSOR).gpu
NETWORK_CPU := $(NETWORK).cpu
NETWORK_GPU := $(NETWORK).gpu
ifeq ($(USE_CUDA), 1)
TENSOR := $(TENSOR_GPU)
NETWORK := $(NETWORK_GPU)
else
TENSOR := $(TENSOR_CPU)
NETWORK := $(NETWORK_CPU)
endif
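# with the defaults above (PREFIX = niutrans, USE_CUDA = 1) the
# executables are named niutrans.tensor.gpu and niutrans.network.gpu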
# specify the compiling arguments here
CFLAGS = -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# GTX 1080: arch=compute_61,code=sm_61
# K80:      arch=compute_37,code=sm_37
# if the architecture is set incorrectly, the results can be `-inf`
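# one way to check a card's compute capability (assumption: a driver
# recent enough to support this query):
#     nvidia-smi --query-gpu=compute_cap --format=csv,noheader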
CUDA_FLAG = -arch=sm_30 \
            -gencode=arch=compute_30,code=sm_30 \
            -gencode=arch=compute_50,code=sm_50 \
            -gencode=arch=compute_52,code=sm_52 \
            -gencode=arch=compute_60,code=sm_60 \
            -gencode=arch=compute_61,code=sm_61 \
            -gencode=arch=compute_62,code=sm_62 \
            -gencode=arch=compute_70,code=sm_70 \
            -gencode=arch=compute_70,code=compute_70 \
            -maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math
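# the last entry (code=compute_70) also embeds PTX in the binary, so
# GPUs newer than sm_70 can still run it via JIT compilation at load time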
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# collect the source files
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(Tensor_Main), ,$(SOURCES))
SOURCES := $(subst $(Network_Main), ,$(SOURCES))
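# both Main.cpp files are removed from the shared source list here;
# each is compiled and linked into its own executable in the rules below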
# object files
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start tensor network finish

tensor: $(TENSOR)
network: $(NETWORK)

$(TENSOR): $(OBJS) $(Tensor_Main)
	@echo "Making executable file: $(TENSOR)"
	@$(CXX) $(Tensor_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@

$(NETWORK): $(OBJS) $(Network_Main)
	@echo "Making executable file: $(NETWORK)"
	@$(CXX) $(Network_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@

start:
	@echo ""
	@echo "Start Making ..."

finish:
	@echo "Finish Making ..."
	@echo ""

%.o: %.c
	@$(CC) $(CFLAGS) -c $< -o $@

%.o: %.cpp
	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@

%.cuo: %.cu
	@$(NVCC) $(CUDA_FLAG) -c $< -o $@

.PHONY: all start finish tensor network clean cleanexe
clean:
	@echo "Cleaning object files"
	@-rm -f $(OBJS)

cleanexe:
	@echo "Cleaning executable files"
	@-rm -f $(TENSOR_CPU) $(NETWORK_CPU) $(TENSOR_GPU) $(NETWORK_GPU)
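# typical usage (a sketch; adjust the paths at the top for your system):
#     make -j 8            # build both the tensor and the network executables
#     make tensor          # build only the tensor executable
#     make clean cleanexe  # remove object files and executables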
@@ -86,6 +86,27 @@ dE/da = IndexToOnehot(b)
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");

    XTensor * input = income.tails[0];

    /* onehot-to-index yields discrete indices, so no gradient values are
       propagated back; we only make sure the input has a gradient tensor */
    XNoder::MakeGrad(input);
}

/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
......
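For readers unfamiliar with the operation itself, here is a minimal standalone sketch of what IndexToOnehot computes. It is plain C++ for illustration only; indexToOnehot and its vector-based types are hypothetical stand-ins, not part of the XTensor API:

#include <cstdio>
#include <vector>

/* illustrative only: expand a vector of class indices into one-hot rows */
std::vector<std::vector<float>> indexToOnehot(const std::vector<int> &index,
                                              int numClasses)
{
    std::vector<std::vector<float>> onehot(index.size(),
                                           std::vector<float>(numClasses, 0.0F));
    for (size_t i = 0; i < index.size(); ++i)
        onehot[i][index[i]] = 1.0F;
    return onehot;
}

int main()
{
    /* indices {2, 0} with 3 classes -> rows (0, 0, 1) and (1, 0, 0) */
    for (const auto &row : indexToOnehot({2, 0}, 3)) {
        for (float v : row)
            std::printf("%g ", v);
        std::printf("\n");
    }
    return 0;
}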