Commit ba8bc234 by Tianzhi

Use the OpenBLAS API to accelerate computation on the CPU

parent b9871b8d
# the prefix of the generated executable file
PREFIX := niutrans
TENSOR := $(PREFIX).tensor
NETWORK := $(PREFIX).network
# code path
SRC = ./source
# use GPU?
USE_CUDA = 0
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda-9.0
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 1
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
SRC_DIR = $(shell find $(SRC) -type d)
# header include directories
# external dependency library directories
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependency
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
$(MKL_LIB_DIR)/libmkl_core.a \
$(MKL_LIB_DIR)/libmkl_intel_thread.a \
$(INTEL_ROOT)/lib/intel64/libiomp5.a
DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
$(CUDA_LIB_DIR)/libculibos.a \
$(CUDA_LIB_DIR)/libnpps_static.a \
$(CUDA_LIB_DIR)/libnppc_static.a \
$(CUDA_LIB_DIR)/libcudadevrt.a \
$(CUDA_LIB_DIR)/libcurand_static.a \
/lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = Main.cpp
Tensor_Main := $(SRC)/tensor/$(MAIN_FILE)
Network_Main := $(SRC)/network/$(MAIN_FILE)
TENSOR_CPU := $(TENSOR).cpu
TENSOR_GPU := $(TENSOR).gpu
NETWORK_CPU := $(NETWORK).cpu
NETWORK_GPU := $(NETWORK).gpu
ifeq ($(USE_CUDA), 1)
TENSOR := $(TENSOR_GPU)
NETWORK := $(NETWORK_GPU)
else
TENSOR := $(TENSOR_CPU)
NETWORK := $(NETWORK_CPU)
endif
# specify the compiling arguments here
CFLAGS = -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# gtx 1080 arch=compute_61,code=sm_61
# k80 arch=compute_37,code=sm_37
# if the arch/code settings are wrong, the result can be `-inf`
CUDA_FLAG = -arch=sm_30 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70 \
-maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# decoder source file
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(Tensor_Main), ,$(SOURCES))
SOURCES := $(subst $(Network_Main), ,$(SOURCES))
# object file
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start tensor network finish
tensor: $(TENSOR)
network: $(NETWORK)
$(TENSOR): $(OBJS) $(Tensor_Main)
	@echo "Making executable file: $(TENSOR)"
	@$(CXX) $(Tensor_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
$(NETWORK): $(OBJS) $(Network_Main)
	@echo "Making executable file: $(NETWORK)"
	@$(CXX) $(Network_Main) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
start:
	@echo ""
	@echo "Start Making ..."
finish:
	@echo "Finish Making ..."
	@echo ""
%.o: %.c
	@$(CC) $(CFLAGS) -c $< -o $@
%.o: %.cpp
	@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@
%.cuo: %.cu
	@$(NVCC) $(CUDA_FLAG) -c $< -o $@
.PHONY: clean
clean:
	@echo "Making clean object files"
	@-rm -f $(OBJS)
cleanexe:
	@echo "Making clean executable files"
	@-rm -f $(TENSOR_CPU) $(NETWORK_CPU) $(TENSOR_GPU) $(NETWORK_GPU)
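
With USE_OPENBLAS = 1 the build only adds -DUSE_BLAS -DOPENBLAS plus the OpenBLAS include and lib paths, and links -lopenblas; the sources are expected to pick up the backend through those macros. The header name cblas.h and the useBLAS flag below are assumptions inferred from the calls in the diffs that follow, shown here only as a minimal sketch, not code from the commit:

#ifdef USE_BLAS
#ifdef OPENBLAS
#include <cblas.h>              // OpenBLAS's CBLAS interface, found via OPENBLAS_INCLUDE
#endif
static bool useBLAS = true;     // the diffs below branch on a flag with this name
#else
static bool useBLAS = false;    // fall back to the hand-written loops
#endif
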
@@ -82,7 +82,12 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
        DTYPE * ap = (DTYPE*)a->data;
        DTYPE * bp = (DTYPE*)b->data;
        DTYPE * cp = (DTYPE*)c->data;
        // when c != a, OpenBLAS would have to copy a into c first, which
        // slows things down, so only take the BLAS path when c == a
        if(useBLAS && c == a){
            // c = c + b * beta, i.e. a + b * beta, since c and a share storage
            cblas_saxpy(a->unitNum, beta, bp, 1, cp, 1);
        }
        else{
            /* unrolling */
            int num = a->unitNum;
            if (num % 4 == 0) {
@@ -105,6 +110,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
                }
            }
        }
    }
    else {
        // TODO!!
        ShowNTErrors("TODO!");
......
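
For reference, cblas_saxpy(n, alpha, x, incx, y, incy) computes y = alpha * x + y in place, which is why the fast path above is restricted to c == a: the call accumulates into cp, so cp must already hold a's data. A small self-contained sketch on plain float buffers (not part of the commit; assumes OpenBLAS's cblas.h is on the include path):

#include <cblas.h>
#include <cstdio>

int main() {
    float a[4] = {1, 2, 3, 4};      // plays the role of a (and c, since c == a)
    float b[4] = {10, 20, 30, 40};  // plays the role of b
    float beta = 0.5f;
    // a = a + beta * b, the same update _Sum needs when c and a share storage
    cblas_saxpy(4, beta, b, 1, a, 1);
    for (int i = 0; i < 4; i++)
        printf("%g ", a[i]);        // prints: 6 12 18 24
    printf("\n");
    return 0;
}
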
@@ -68,12 +68,16 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
    else{
        DTYPE * va = (DTYPE*)a->data;
        DTYPE * vb = (DTYPE*)b->data;
        if(shift == 0 && useBLAS && a == b){
            cblas_sscal(b->unitNum, scale, vb, 1);
        } else{
            for(int i = 0; i < b->unitNum; i++){
                *vb = *va * scale + shift;
                va++;
                vb++;
            }
        }
    }
}
/*
......
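
cblas_sscal(n, alpha, x, incx) scales a vector in place (x = alpha * x), so it can only replace b = a * scale + shift when shift == 0 and a and b share storage, which is exactly the guard used above. A minimal sketch under the same assumptions:

#include <cblas.h>
#include <cstdio>

int main() {
    float v[4] = {1, 2, 3, 4};
    // in-place scaling: v = 2.5 * v, i.e. the shift == 0, a == b case of _ScaleAndShift
    cblas_sscal(4, 2.5f, v, 1);
    for (int i = 0; i < 4; i++)
        printf("%g ", v[i]);        // prints: 2.5 5 7.5 10
    printf("\n");
    return 0;
}
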
@@ -77,6 +77,9 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
    blockSize = stride * strideNum;
    for(int k = 0; k < blockNum; k++){
        DTYPE * ip = (DTYPE*)input->data + blockSize * k;
        DTYPE * op = (DTYPE*)output->data + stride * k;
        for(int i = 0; i < stride; i++){
            if(useBLAS){
                // note: cblas_isamax returns the position (in stride-sized steps)
                // of the largest |value|, so this fast path assumes non-negative data
                *(op + i) = *(ip + i + cblas_isamax(strideNum, ip + i, stride) * stride);
            } else{
@@ -91,6 +94,7 @@ void _ReduceMax(const XTensor * input, XTensor * output, int dim)
                }
            }
        }
    }
}
/*
......
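
cblas_isamax(n, x, incx) returns the zero-based position, counted in incx-sized steps, of the element with the largest absolute value, not the largest signed value. Two consequences for the ReduceMax fast path: the returned position must be multiplied by the stride before being added to the pointer, and the result only equals a true maximum when the data is non-negative. A short sketch illustrating both (assumes OpenBLAS's cblas.h):

#include <cblas.h>
#include <cstdio>

int main() {
    // strided "column": elements x[0], x[2], x[4] = 1, -5, 2 (incx == 2)
    float x[6] = {1.0f, 0.0f, -5.0f, 0.0f, 2.0f, 0.0f};
    size_t pos = cblas_isamax(3, x, 2);
    // pos counts strided steps, so the chosen element is x[pos * 2];
    // it is -5 (largest |value|), not the signed maximum 2
    printf("pos = %zu, value = %g\n", pos, x[pos * 2]);
    return 0;
}
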
@@ -143,15 +143,23 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
    else{
        if(bias == 0){
            if(power == (DTYPE)1.0){
                // note: cblas_sasum sums absolute values, so this shortcut
                // matches the plain sum only for non-negative input
                if(useBLAS)
                    sum = cblas_sasum(strideNum, ip + i, stride);
                else
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                        sum += *ipb;
            }
            else if(power == (DTYPE)2.0){
                if(useBLAS){
                    // snrm2 returns sqrt(sum of squares), so square it back
                    sum = cblas_snrm2(strideNum, ip + i, stride);
                    sum = sum * sum;
                } else{
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                        DTYPE value = (*ipb);
                        sum += value * value;
                    }
                }
            }
            else if(power == (DTYPE)0.5){
                for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
                    DTYPE value = (*ipb);
@@ -167,6 +175,9 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
        }
        else{
            if(power == (DTYPE)1.0){
                if(useBLAS)
                    sum = cblas_sasum(strideNum, ip + i, stride);
                else
                    for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride)
                        sum += *ipb;
                sum -= strideNum * bias;
......
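
The two BLAS calls above do not compute exactly what the scalar loops do: cblas_sasum returns the sum of absolute values (equal to the plain sum only for non-negative input), while cblas_snrm2 returns the Euclidean norm, i.e. the square root of the sum of squares, which is why the code squares it afterwards. A small sketch (assumes OpenBLAS's cblas.h):

#include <cblas.h>
#include <cstdio>

int main() {
    float x[4] = {1.0f, -2.0f, 3.0f, -4.0f};

    // sum of |x|: 1 + 2 + 3 + 4 = 10, whereas the plain sum of x is -2
    float asum = cblas_sasum(4, x, 1);

    // Euclidean norm; squaring it recovers the sum of squares: 1 + 4 + 9 + 16 = 30
    float nrm = cblas_snrm2(4, x, 1);
    float sumOfSquares = nrm * nrm;

    printf("asum = %g, sum of squares = %g\n", asum, sumOfSquares);
    return 0;
}
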