Commit ba8bc234 by Tianzhi

Use the OpenBLAS API to accelerate computation on the CPU

parent b9871b8d
# the prefix of the generated executable file
PREFIX := niutrans
TENSOR := $(PREFIX).tensor
NETWORK := $(PREFIX).network
# code path
SRC = ./source
# use GPU?
USE_CUDA = 0
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda-9.0
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 1
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
SRC_DIR = $(shell find $(SRC) -type d)
# header include directories
# external dependency library directories
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependency
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
$(MKL_LIB_DIR)/libmkl_core.a \
$(MKL_LIB_DIR)/libmkl_intel_thread.a \
$(INTEL_ROOT)/lib/intel64/libiomp5.a
DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
$(CUDA_LIB_DIR)/libculibos.a \
$(CUDA_LIB_DIR)/libnpps_static.a \
$(CUDA_LIB_DIR)/libnppc_static.a \
$(CUDA_LIB_DIR)/libcudadevrt.a \
$(CUDA_LIB_DIR)/libcurand_static.a \
/lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = Main.cpp
Tensor_Main := $(SRC)/tensor/$(MAIN_FILE)
Network_Main := $(SRC)/network/$(MAIN_FILE)
TENSOR_CPU := $(TENSOR).cpu
TENSOR_GPU := $(TENSOR).gpu
NETWORK_CPU := $(NETWORK).cpu
NETWORK_GPU := $(NETWORK).gpu
ifeq ($(USE_CUDA), 1)
TENSOR := $(TENSOR_GPU)
NETWORK := $(NETWORK_GPU)
else
TENSOR := $(TENSOR_CPU)
NETWORK := $(NETWORK_CPU)
endif
# specify the compiling arguments here
CFLAGS = -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# gtx 1080 arch=compute_61,code=sm_61
# k80 arch=compute_37,code=sm_37
# if the arch/code settings are wrong, the result can be `-inf`
CUDA_FLAG = -arch=sm_30 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70 \
-maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# decoder source file
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(Tensor_Main), ,$(SOURCES))
SOURCES := $(subst $(Network_Main), ,$(SOURCES))
# object file
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start tensor network finish
tensor: $(TENSOR)
network: $(NETWORK)
$(TENSOR): $(OBJS) $(Tensor_Main)
	@echo "Making executable file: $(TENSOR)"
	@$(CXX) $(Tensor_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
$(NETWORK): $(OBJS) $(Network_Main)
	@echo "Making executable file: $(NETWORK)"
	@$(CXX) $(Network_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
start:
	@echo ""
	@echo "Start Making ..."
finish:
	@echo "Finish Making ..."
	@echo ""
%.o: %.c
	@$(CC) $(CFLAGS) -c $< -o $@
%.o: %.cpp
	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@
%.cuo: %.cu
	@$(NVCC) $(CUDA_FLAG) -c $< -o $@
.PHONY: clean
clean:
	@echo "Making clean object files"
	@-rm -f $(OBJS)
cleanexe:
	@echo "Making clean executable files"
	@-rm -f $(TENSOR_CPU) $(NETWORK_CPU) $(TENSOR_GPU) $(NETWORK_GPU)
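
With USE_OPENBLAS = 1 the build only adds -DUSE_BLAS -DOPENBLAS plus the OpenBLAS include and lib paths, and links -lopenblas; the sources are expected to pick up the backend through those macros. The header name cblas.h and the useBLAS flag below are assumptions inferred from the calls in the diffs that follow, shown here only as a minimal sketch, not code from the commit:

#ifdef USE_BLAS
#ifdef OPENBLAS
#include <cblas.h>              // OpenBLAS's CBLAS interface, found via OPENBLAS_INCLUDE
#endif
static bool useBLAS = true;     // the diffs below branch on a flag with this name
#else
static bool useBLAS = false;    // fall back to the hand-written loops
#endif
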
@@ -82,7 +82,12 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
        DTYPE * ap = (DTYPE*)a->data;
        DTYPE * bp = (DTYPE*)b->data;
        DTYPE * cp = (DTYPE*)c->data;
        // when c != a, OpenBLAS would have to copy a into c first, which
        // slows things down, so only take the BLAS path when c == a
        if(useBLAS && c == a){
            // c = c + b * beta, i.e. a + b * beta, since c and a share storage
            cblas_saxpy(a->unitNum, beta, bp, 1, cp, 1);
        }
        else{
            /* unrolling */
            int num = a->unitNum;
            if (num % 4 == 0) {
@@ -105,6 +110,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
                }
            }
        }
    }
    else {
        // TODO!!
        ShowNTErrors("TODO!");
......
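
For reference, cblas_saxpy(n, alpha, x, incx, y, incy) computes y = alpha * x + y in place, which is why the fast path above is restricted to c == a: the call accumulates into cp, so cp must already hold a's data. A small self-contained sketch on plain float buffers (not part of the commit; assumes OpenBLAS's cblas.h is on the include path):

#include <cblas.h>
#include <cstdio>

int main() {
    float a[4] = {1, 2, 3, 4};      // plays the role of a (and c, since c == a)
    float b[4] = {10, 20, 30, 40};  // plays the role of b
    float beta = 0.5f;
    // a = a + beta * b, the same update _Sum needs when c and a share storage
    cblas_saxpy(4, beta, b, 1, a, 1);
    for (int i = 0; i < 4; i++)
        printf("%g ", a[i]);        // prints: 6 12 18 24
    printf("\n");
    return 0;
}
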
@@ -68,12 +68,16 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
    else{
        DTYPE * va = (DTYPE*)a->data;
        DTYPE * vb = (DTYPE*)b->data;
        if(shift == 0 && useBLAS && a == b){
            cblas_sscal(b->unitNum, scale, vb, 1);
        } else{
            for(int i = 0; i < b->unitNum; i++){
                *vb = *va * scale + shift;
                va++;
                vb++;
            }
        }
    }
}
/*
......
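
cblas_sscal(n, alpha, x, incx) scales a vector in place (x = alpha * x), so it can only replace b = a * scale + shift when shift == 0 and a and b share storage, which is exactly the guard used above. A minimal sketch under the same assumptions:

#include <cblas.h>
#include <cstdio>

int main() {
    float v[4] = {1, 2, 3, 4};
    // in-place scaling: v = 2.5 * v, i.e. the shift == 0, a == b case of _ScaleAndShift
    cblas_sscal(4, 2.5f, v, 1);
    for (int i = 0; i < 4; i++)
        printf("%g ", v[i]);        // prints: 2.5 5 7.5 10
    printf("\n");
    return 0;
}
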
@@ -77,6 +77,9 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
    blockSize = stride * strideNum;
    for(int k = 0; k < blockNum; k++){
        DTYPE * ip = (DTYPE*)input->data + blockSize * k;
        DTYPE * op = (DTYPE*)output->data + stride * k;
        for(int i = 0; i < stride; i++){
            if(useBLAS){
                // note: cblas_isamax returns the position (in stride-sized steps)
                // of the largest |value|, so this fast path assumes non-negative data
                *(op + i) = *(ip + i + cblas_isamax(strideNum, ip + i, stride) * stride);
            } else{
@@ -91,6 +94,7 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
                }
            }
        }
    }
}
/*
......
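
cblas_isamax(n, x, incx) returns the zero-based position, counted in incx-sized steps, of the element with the largest absolute value, not the largest signed value. Two consequences for the ReduceMax fast path: the returned position must be multiplied by the stride before being added to the pointer, and the result only equals a true maximum when the data is non-negative. A short sketch illustrating both (assumes OpenBLAS's cblas.h):

#include <cblas.h>
#include <cstdio>

int main() {
    // strided "column": elements x[0], x[2], x[4] = 1, -5, 2 (incx == 2)
    float x[6] = {1.0f, 0.0f, -5.0f, 0.0f, 2.0f, 0.0f};
    size_t pos = cblas_isamax(3, x, 2);
    // pos counts strided steps, so the chosen element is x[pos * 2];
    // it is -5 (largest |value|), not the signed maximum 2
    printf("pos = %zu, value = %g\n", pos, x[pos * 2]);
    return 0;
}
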
@@ -143,15 +143,23 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
    else{
        if(bias == 0){
            if(power == (DTYPE)1.0){
                // note: cblas_sasum sums absolute values, so this shortcut
                // matches the plain sum only for non-negative input
                if(useBLAS)
                    sum = cblas_sasum(strideNum, ip + i, stride);
                else
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                        sum += *ipb;
            }
            else if(power == (DTYPE)2.0){
                if(useBLAS){
                    // snrm2 returns sqrt(sum of squares), so square it back
                    sum = cblas_snrm2(strideNum, ip + i, stride);
                    sum = sum * sum;
                } else{
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                        DTYPE value = (*ipb);
                        sum += value * value;
                    }
                }
            }
            else if(power == (DTYPE)0.5){
                for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                    DTYPE value = (*ipb);
@@ -167,6 +175,9 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
        }
        else{
            if(power == (DTYPE)1.0){
                if(useBLAS)
                    sum = cblas_sasum(strideNum, ip + i, stride);
                else
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                        sum += *ipb;
                sum -= strideNum * bias;
......
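
The two BLAS calls above do not compute exactly what the scalar loops do: cblas_sasum returns the sum of absolute values (equal to the plain sum only for non-negative input), while cblas_snrm2 returns the Euclidean norm, i.e. the square root of the sum of squares, which is why the code squares it afterwards. A small sketch (assumes OpenBLAS's cblas.h):

#include <cblas.h>
#include <cstdio>

int main() {
    float x[4] = {1.0f, -2.0f, 3.0f, -4.0f};

    // sum of |x|: 1 + 2 + 3 + 4 = 10, whereas the plain sum of x is -2
    float asum = cblas_sasum(4, x, 1);

    // Euclidean norm; squaring it recovers the sum of squares: 1 + 4 + 9 + 16 = 30
    float nrm = cblas_snrm2(4, x, 1);
    float sumOfSquares = nrm * nrm;

    printf("asum = %g, sum of squares = %g\n", asum, sumOfSquares);
    return 0;
}
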