Commit 55dd6a78 by liyinqiao

Merge with the branches of xuchen and huchi.

1. Fix bugs in the backward process.
2. Support the float16 data type.
3. Fix assorted bugs.
4. Clean up the code.
5. Remove the Makefile.
parent 2f7adb8c
# the prefix of the generated executable file
NIUTRANS_EXE := NiuTensor
# code path and generated file path
ROOT = .
SRC = $(ROOT)/source
LIB_DIR = $(ROOT)/lib
EXE_DIR = $(ROOT)/bin
# whether to generate a dynamic link library (dll)
dll = 0
# 0 - on Windows or Linux platform
# 1 - on Macintosh platform
OnMac = 0
# 0 - use CPU
# 1 - use GPU
USE_CUDA = 0
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 0
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
SRC_DIR = $(shell find $(SRC) -type d)
# header file search directories
# external dependency library directories
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependency
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
$(MKL_LIB_DIR)/libmkl_core.a \
$(MKL_LIB_DIR)/libmkl_intel_thread.a \
$(INTEL_ROOT)/lib/intel64/libiomp5.a
DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
$(CUDA_LIB_DIR)/libculibos.a \
$(CUDA_LIB_DIR)/libnpps_static.a \
$(CUDA_LIB_DIR)/libnppc_static.a \
$(CUDA_LIB_DIR)/libcudadevrt.a \
$(CUDA_LIB_DIR)/libcurand_static.a \
/lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
ifeq ($(OnMac), 1)
DEPLIBS = $(STATIC_DEPLIB) -lm -ldl $(DYNAMIC_DEPLIB)
else
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
endif
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = $(SRC)/Main.cpp
ifeq ($(USE_CUDA), 1)
NIUTRANS_EXE := $(NIUTRANS_EXE).GPU
else
NIUTRANS_EXE := $(NIUTRANS_EXE).CPU
endif
NIUTRANS_DLL := $(LIB_DIR)/lib$(NIUTRANS_EXE).so
NIUTRANS_EXE := $(EXE_DIR)/$(NIUTRANS_EXE)
# specify the compiling arguments here
CFLAGS = -std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# GTX 1080: arch=compute_61,code=sm_61
# K80:      arch=compute_37,code=sm_37
# if the target architecture is set incorrectly, the results can silently become `-inf`
CUDA_FLAG = -arch=sm_30 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70 \
-maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math -std=c++11
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# decoder source file
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(MAIN_FILE), ,$(SOURCES))
# object file
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start lib exe finish
start:
@echo ""
@echo "Start building ..."
lib: start_lib niutrans_dll finish_lib
start_lib:
@mkdir -p $(LIB_DIR)
@echo ""
@echo "Start building library"
niutrans_dll: $(NIUTRANS_DLL)
$(NIUTRANS_DLL): $(OBJS)
ifeq ($(dll), 1)
@echo "Building dynamic link library: $(NIUTRANS_DLL)"
@$(CXX) -shared -Wall $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
else
@echo "Skip building dynamic link library"
endif
finish_lib:
@echo "Finish building library"
@echo ""
exe: start_exe niutrans_exe finish_exe
start_exe:
@mkdir -p $(EXE_DIR)
@echo ""
@echo "Start building executable file"
niutrans_exe: $(NIUTRANS_EXE)
$(NIUTRANS_EXE): $(OBJS) $(MAIN_FILE)
@echo "Building executable file: $(NIUTRANS_EXE)"
@$(CXX) $(MAIN_FILE) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
finish_exe:
@echo "Finish building executable file"
@echo ""
finish:
@echo "Finish building ..."
@echo ""
%.o: %.c
@$(CC) $(CFLAGS) -c $< -o $@
%.o: %.cpp
@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@
%.cuo: %.cu
ifeq ($(dll), 1)
@$(NVCC) --shared --compiler-options '-fPIC' $(CUDA_FLAG) -c $< -o $@
else
@$(NVCC) $(CUDA_FLAG) -c $< -o $@
endif
.PHONY: clean
clean:
@echo "Cleaning object files"
@-rm -f $(OBJS)
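Note: every switch above is an ordinary make variable, so it can also be overridden from the command line instead of editing this file; for example, "make -j8 USE_CUDA=1 USE_MKL=1 dll=1" builds the GPU version with MKL and the shared library, and "make clean" removes the object files.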
@@ -53,6 +53,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
     XTensor * dedy = output->grad;
     //XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
     XTensor * tmp = NewTensor(output);
+    tmp->SetZeroAll();
     if (operID == FUNC_HARDTANH)
         _HardTanHBackward(output, input, dedy, tmp);
...
@@ -60,6 +60,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
     //XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
     XTensor* tmp = NewTensor(output);
+    tmp->SetZeroAll();
     if (operID == LOSS_CROSSENTROPY) {
         if (income.tailNum == 3)
...
@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /*************************************************
 * we define the "new and delete" functions below
 */
+bool X_ENABLE_GRAD = true;
 /*
 initialize a tensor V2
...
@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 */
 /* global flag for enabling gradient flows or not */
-static bool X_ENABLE_GRAD = true;
+extern bool X_ENABLE_GRAD;
-#define DISABLE_GRAD X_ENABLE_GRAD=false
 #define ENABLE_GRAD X_ENABLE_GRAD=true
+#define DISABLE_GRAD X_ENABLE_GRAD=false
 /* initialize a XTensor V2 */
 void InitTensorV2(XTensor * tensor,
...
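The new global flag gives callers one switch to turn gradient tracking off around inference-only code. A minimal sketch of the intended usage, assuming the InitTensor2DV2 and MatrixMul helpers and hypothetical tensors x and w (the macros simply assign X_ENABLE_GRAD, so the MakeLink() calls below return early):

    XTensor x, w, y;
    InitTensor2DV2(&x, 16, 32, X_FLOAT);
    InitTensor2DV2(&w, 32, 64, X_FLOAT);
    DISABLE_GRAD;          // X_ENABLE_GRAD = false: no nodes are added to the network
    y = MatrixMul(x, w);   // pure forward computation, no backward bookkeeping
    ENABLE_GRAD;           // restore gradient tracking before any training code runs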
@@ -60,25 +60,4 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
     }
 }
-/*
-Below is for calling CPU BLAS for fast matrix operations
-I'm not sure how fast it is. But it seems that other
-guys are crazy about this. So I decided to have a try.
-*/
-/* float -> float16 */
-_XINLINE_ unsigned short FloatToFloat16(float f)
-{
-    unsigned int x = *((unsigned int*)&f);
-    unsigned short h = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff);
-    return h;
-}
-/* float16 -> float */
-_XINLINE_ float Float16ToFloat(unsigned short h)
-{
-    float f = float(((h&0x8000)<<16) | (((h&0x7c00)+0x1C000)<<13) | ((h&0x03FF)<<13));
-    return f;
-}
 } /* end of the nts (NiuTrans.Tensor) namespace */
@@ -46,10 +46,6 @@ enum MATRIX_TRANS_TYPE{X_TRANS, X_NOTRANS};
 extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
 extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
-/* data conversion (for lower precision computation) */
-unsigned short FloatToFloat16(float f);
-float Float16ToFloat(unsigned short h);
 #define CheckDataType(a, b) \
 { \
     if(GetDataTypeName(a) != GetDataTypeName(a)){ \
...
@@ -143,6 +143,11 @@ extern int verboseLevel;
     fflush(FILEH); \
 } \
+#define LOG(...) do {\
+    fprintf(stderr, "[INFO] ");\
+    fprintf(stderr, __VA_ARGS__);\
+    fprintf(stderr, "\n");\
+} while(0)
 #define XPRINT(VERBOSE,FILEH,STR) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR);FFLUSH(FILEH);}}
 #define XPRINT1(VERBOSE,FILEH,STR,ARG) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG);FFLUSH(FILEH);}}
 #define XPRINT2(VERBOSE,FILEH,STR,ARG,ARG2) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2);FFLUSH(FILEH);}}
...
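The new LOG macro adds printf-style logging with an "[INFO]" prefix and a trailing newline, e.g. (hypothetical values):

    LOG("loading model from %s", fn);
    LOG("epoch %d finished, loss = %.4f", epoch, loss);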
@@ -300,7 +300,7 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
     if(h == NULL)
         return;
-    if (!t1->enableGrad)
+    if (!(t1->enableGrad && X_ENABLE_GRAD))
         return;
     TensorList list(2);
@@ -323,7 +323,7 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
     if (h == NULL)
         return;
-    if (!t1->enableGrad || !t2->enableGrad)
+    if (!t1->enableGrad || !t2->enableGrad || !X_ENABLE_GRAD)
         return;
     TensorList list(3);
@@ -342,6 +342,9 @@ create a hyper edge with a list of tensors and a output tensor
 */
 void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
 {
+    if (!X_ENABLE_GRAD || !h->enableGrad)
+        return;
     /* forward */
     XLink &income = h->income;
     income.Reset();
@@ -376,7 +379,7 @@ create a hyper edge with a input tensors and a list of output tensors
 */
 void XLink::MakeLink(XTensor * t, TensorList * list, int id)
 {
-    if (!t->enableGrad)
+    if (!t->enableGrad || !X_ENABLE_GRAD)
         return;
     /* forward */
@@ -633,7 +636,9 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
     ClearIncoming(target);
     int tailNum = reference->income.tailNum;
-    TensorList tails(tailNum);
+    if (tailNum <= 0)
+        return;
+    TensorList tails;
     for(int i = 0; i < tailNum; i++){
         XTensor * tail = (XTensor*)reference->income.tails[i];
         tails.Add(tail);
...
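With these guards a backward edge is recorded only when the global flag and the enableGrad flag of every input tensor are set. A small sketch of the per-tensor effect (hypothetical tensors, assuming the Sum helper):

    XTensor a, b;
    InitTensor2DV2(&a, 8, 8, X_FLOAT);
    InitTensor2DV2(&b, 8, 8, X_FLOAT);
    a.enableGrad = false;
    XTensor c = Sum(a, b);   // MakeLink() returns early: c.income records no tails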
@@ -395,6 +395,27 @@ void TensorListBase<T>::Shuffle(int nround, int beg, int len)
     }
 }
+/*
+read data from a file
+>> fp - pointer to a file
+>> num - number of items to be read
+*/
+template<typename T>
+void TensorListBase<T>::ReadFromFile(FILE* fp, int num)
+{
+    if (maxNum < num) {
+        if(!items)
+            Reserve(num - maxNum);
+        else {
+            free(items);
+            items = (T*)malloc(sizeof(T) * num);
+        }
+    }
+    fread(items, sizeof(T), num, fp);
+    maxNum = num;
+    count += num;
+}
 /* specializations and typedef of list */
 template struct TensorListBase<int>;
 template struct TensorListBase<char>;
@@ -406,6 +427,7 @@ template struct TensorListBase<XTensor*>;
 template struct TensorListBase<uint64_t>;
 template struct TensorListBase<void*>;
 template struct TensorListBase<Example*>;
+template struct TensorListBase<TrainExample*>;
 template struct TensorListBase<Result*>;
 } /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
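A possible use of the new ReadFromFile, assuming the IntList typedef from TList.h and a binary file previously written with fwrite (the caller owns the FILE*):

    IntList ids;
    FILE * fp = fopen("ids.bin", "rb");
    ids.ReadFromFile(fp, 1024);   // grows the buffer if needed, then fread()s 1024 ints
    fclose(fp);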
@@ -123,6 +123,9 @@ public:
     /* shuffle the list */
     void Shuffle(int nround = 10, int beg = -1, int len = 0);
+    /* read data from a file */
+    void ReadFromFile(FILE* fp, int num);
     /* short */
     T& operator[] (int i) {
         CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
@@ -138,6 +141,7 @@ public:
 struct XTensor;
 struct Example;
+struct TrainExample;
 struct Result;
 typedef TensorListBase<void*> XList;
@@ -150,6 +154,7 @@ typedef TensorListBase<short> ShortList;
 typedef TensorListBase<uint64_t> UInt64List;
 typedef TensorListBase<XTensor*> TensorList;
 typedef TensorListBase<Example*> InputBufferType;
+typedef TensorListBase<TrainExample*> TrainBufferType;
 typedef TensorListBase<Result*> OutputBufferType;
 } /* end of the nts (NiuTrans.Tensor) namespace */
...
@@ -55,6 +55,10 @@ const char * GetOPName(int type)
         return "M_ROUND";
     else if (type == MATH_RECIPROCAL)
         return "M_RECIPROCAL";
+    else if (type == MATH_EQUAL)
+        return "M_EQUAL";
+    else if (type == MATH_NOTEQUAL)
+        return "M_NOTEQUAL";
     else if (type == MATH_CLIP)
         return "M_CLIP";
     else if (type == MATH_DIV)
@@ -67,6 +71,10 @@ const char * GetOPName(int type)
         return "M_MATRIXMUL";
     else if (type == MATH_MATRIXMULBATCHED)
         return "M_MATRIXMULBATCHED";
+    else if (type == MATH_MAX)
+        return "M_MAX";
+    else if (type == MATH_MIN)
+        return "M_MIN";
     else if (type == MATH_MULTIPLY)
         return "M_MULTIPLY";
     else if (type == MATH_MULTIPLYDIM)
...
@@ -46,7 +46,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_ROUND MATH_TAN + 1
 #define MATH_RECIPROCAL MATH_ROUND + 1
-#define MATH_CLIP MATH_RECIPROCAL + 1
+#define MATH_EQUAL MATH_RECIPROCAL + 1
+#define MATH_NOTEQUAL MATH_EQUAL + 1
+#define MATH_CLIP MATH_NOTEQUAL + 1
 #define MATH_DIV MATH_CLIP + 1
 #define MATH_DIVDIM MATH_DIV + 1
 #define MATH_MASK MATH_DIVDIM + 1
...
@@ -134,8 +134,8 @@ constructor
 >> myDevID - device id
 >> myMem - memory pool used to allocating the data array
 */
-XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE myDataType,
-                 const float myDenseRatio, int myDevID, XMem* myMem)
+XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
+                 const float myDenseRatio, int myDevID, XMem * myMem)
 {
     Init();
     SetDataPointer();
@@ -1739,12 +1739,13 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
     }
 }
 else if (dataType == X_FLOAT16) {
-    for(int i = beg; i < end; i++){
-        DTYPE f = ((unsigned short*)d)[i];
-        if(i == beg)
-            fprintf(file, "%e", f);
+    float16* f = (float16*)d;
+    for (int i = beg; i < end; i++) {
+        float v = f[i].Float();
+        if (i == beg)
+            fprintf(file, "%e", v);
         else
-            fprintf(file, " %e", f);
+            fprintf(file, " %e", v);
     }
 }
 else
@@ -1804,11 +1805,12 @@ void XTensor::BinaryDump(FILE* file)
     break;
 }
 case X_FLOAT16: {
-    fwrite(tmp.data, sizeof(unsigned short), unitNum, file);
+    fwrite(tmp.data, sizeof(float16), unitNum, file);
     break;
 }
 default: {
     fwrite(tmp.data, sizeof(float), unitNum, file);
+    break;
 }
 }
 }
@@ -1941,8 +1943,8 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
     break;
 }
 case X_FLOAT16: {
-    unsigned short* d = new unsigned short[unitNum];
-    fread(d, sizeof(unsigned short), unitNum, file);
+    float16* d = new float16[unitNum];
+    fread(d, sizeof(float16), unitNum, file);
     SetData(d, unitNum);
     delete[] d;
     break;
@@ -1952,6 +1954,7 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
     fread(d, sizeof(float), unitNum, file);
     SetData(d, unitNum);
     delete[] d;
+    break;
 }
 }
 }
...
@@ -91,6 +91,7 @@
 #include "sort/TopK.h"
 #include "utilities/CheckData.h"
+#include "utilities/Float16.h"
 #include "utilities/FlushToMem.h"
 #include "utilities/SetAscendingOrder.h"
 #include "utilities/XMatrixSegment.h"
...
@@ -24,6 +24,7 @@
 #include "ConvertDataType.h"
 #include "ConvertDataType.cuh"
 #include "../movement/CopyValues.h"
+#include "../utilities/Float16.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -48,12 +49,12 @@ void ConvertDataType(int devID,
     if(typeS == X_FLOAT && typeT == X_FLOAT16){
         for(int i = 0; i < size; i++){
-            ((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
+            ((float16*)t)[i] = float16(((float*)s)[i]);
         }
     }
     else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
         for(int i = 0; i < size; i++){
-            ((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
+            ((float*)t)[i] = ((float16*)s)[i].Float();
         }
     }
     else{
@@ -94,15 +95,15 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
     }
     else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16) {
         float* inputData = (float*)input->data;
-        unsigned short* outputData = (unsigned short*)output->data;
+        float16* outputData = (float16*)output->data;
         for (int i = 0; i < input->unitNum; i++)
-            outputData[i] = (unsigned short)inputData[i];
+            outputData[i] = (float16)inputData[i];
     }
     else if (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT) {
-        unsigned short* inputData = (unsigned short*)input->data;
+        float16* inputData = (float16*)input->data;
         float* outputData = (float*)output->data;
         for (int i = 0; i < input->unitNum; i++)
-            outputData[i] = (float)inputData[i];
+            outputData[i] = inputData[i].Float();
     }
     else
         ShowNTErrors("Unsupported data types for conversion!");
...
@@ -25,6 +25,7 @@
 #include "SetData.cuh"
 #include "../../XUtility.h"
 #include "../movement/CopyValues.h"
+#include "../utilities/Float16.h"
 #if !defined( WIN32 ) && !defined( _WIN32 )
 #include "sys/time.h"
@@ -434,19 +435,19 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
     if(tensor->dataType == X_FLOAT){
         float * d = (float*)tensor->data;
         for(int i = 0; i < tensor->unitNum; i++){
-            d[i] = variance * ((float)rand()/RAND_MAX) + lower;
+            d[i] = ((float)rand()/RAND_MAX) * variance + lower;
         }
     }
     else if (tensor->dataType == X_FLOAT16) {
-        unsigned short* d = (unsigned short*)tensor->data;
+        float16* d = (float16*)tensor->data;
         for (int i = 0; i < tensor->unitNum; i++) {
-            d[i] = variance * ((unsigned short)rand() / RAND_MAX) + lower;
+            d[i] = ((float16)rand() / RAND_MAX) * variance + lower;
         }
     }
     else if(tensor->dataType == X_DOUBLE){
         double * d = (double*)tensor->data;
         for(int i = 0; i < tensor->unitNum; i++){
-            d[i] = variance * ((double)rand()/RAND_MAX) + lower;
+            d[i] = ((double)rand()/RAND_MAX) * variance + lower;
         }
     }
     else{
...
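The X_FLOAT16 branch also fixes a genuine bug: in the old line, (unsigned short)rand() / RAND_MAX is integer division, so the quotient is almost always 0 and every entry collapses to lower. A standalone illustration of the pitfall, independent of the tensor code:

    #include <cstdio>
    #include <cstdlib>
    int main()
    {
        float bad  = (unsigned short)rand() / RAND_MAX;   // integer division: almost always 0
        float good = (float)rand() / RAND_MAX;            // floating-point division in [0, 1]
        printf("%f %f\n", bad, good);
        return 0;
    }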
@@ -51,6 +51,7 @@ void KernelSetDataFixed(T * d, T v, int size)
 template __global__ void KernelSetDataFixed<int>(int *, int, int);
 template __global__ void KernelSetDataFixed<float>(float *, float, int);
 template __global__ void KernelSetDataFixed<double>(double *, double, int);
+template __global__ void KernelSetDataFixed<__half>(__half*, __half, int);
 /*
 generate data items with a fixed value
@@ -79,6 +80,8 @@ void _CudaSetDataFixed(XTensor * tensor, T value)
         KernelSetDataFixed <<< blocks, threads >>> ((float*)tensor->data, (float)value, tensor->unitNum);
     else if (tensor->dataType == X_DOUBLE)
         KernelSetDataFixed <<< blocks, threads >>> ((double*)tensor->data, (double)value, tensor->unitNum);
+    else if (tensor->dataType == X_FLOAT16)
+        KernelSetDataFixed <<< blocks, threads >>> ((__half*)tensor->data, (__half)value, tensor->unitNum);
     else
         ShowNTErrors("TODO! Unsupported datatype!")
@@ -108,6 +111,8 @@ void KernelSetDataFixedCond(T * d, T * c, T value, int size)
 template __global__ void KernelSetDataFixedCond<int>(int*, int*, int, int);
 template __global__ void KernelSetDataFixedCond<float>(float*, float*, float, int);
 template __global__ void KernelSetDataFixedCond<double>(double*, double*, double, int);
+template __global__ void KernelSetDataFixedCond<__half>(__half*, __half*, __half, int);
 /*
 generate data items with a fixed value p
 only if the condition entry is non-zero
@@ -141,6 +146,9 @@ void _CudaSetDataFixedCond(XTensor* tensor, XTensor* condition, T value)
     else if (tensor->dataType == X_DOUBLE)
         KernelSetDataFixedCond <<< blocks, threads >>> ((double*)tensor->data, (double*)condition->data,
                                                         (double)value, tensor->unitNum);
+    else if (tensor->dataType == X_FLOAT16)
+        KernelSetDataFixedCond <<< blocks, threads >>> ((__half*)tensor->data, (__half*)condition->data,
+                                                        (__half)value, tensor->unitNum);
     else
         ShowNTErrors("TODO! Unsupported datatype!")
...
@@ -92,6 +92,10 @@ XTensor funcName(const XTensor &a, DTYPE number)
     XTensor b(&a); \
     b.SetTMPFlag(); \
     _funcName(&a, &b, number); \
+    if (a.enableGrad) { \
+        XLink::MakeLink(&a, NULL, &b, operationId); \
+        XLink::AddParamToHead(&b, (DTYPE)number); \
+    } \
     return b; \
 }
@@ -102,6 +106,10 @@ void funcName(const XTensor &a, XTensor &b, DTYPE number)
         InitTensorV2(&b, &a); \
     } \
     _funcName(&a, &b, number); \
+    if (a.enableGrad) { \
+        XLink::MakeLink(&a, NULL, &b, operationId); \
+        XLink::AddParamToHead(&b, (DTYPE)number); \
+    } \
 }
 // I think we needn't to make link.
@@ -186,6 +194,9 @@ XTensor funcName(const XTensor & a, const XTensor & b)
     XTensor c(&a); \
     c.SetTMPFlag(); \
     _funcName(&a, &b, &c); \
+    if (a.enableGrad && b.enableGrad) { \
+        XLink::MakeLink(&a, &b, &c, operationId); \
+    } \
     return c; \
 }
@@ -196,16 +207,33 @@ void funcName(const XTensor &a, const XTensor &b, XTensor c)
         InitTensor(&c, &a); \
     } \
     _funcName(&a, &b, &c); \
+    if (a.enableGrad && b.enableGrad) { \
+        XLink::MakeLink(&a, &b, &c, operationId); \
+    } \
 }
 #ifdef USE_CUDA
+_SIMPLE_MAX_MIN_FUNCTION(_Equal, _CudaEqual, myIsEqual)
+_SIMPLE_MAX_MIN_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
 _SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, MAX)
 _SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, MIN)
 #else
+_SIMPLE_MAX_MIN_FUNCTION(_Equal, myIsEqual)
+_SIMPLE_MAX_MIN_FUNCTION(_NotEqual, myIsNotEqual)
 _SIMPLE_MAX_MIN_FUNCTION(_Max, MAX)
 _SIMPLE_MAX_MIN_FUNCTION(_Min, MIN)
 #endif
+_SIMPLE_MAX_MIN_FUNCTION_ME(_EqualMe, _Equal)
+SIMPLE_MAX_MIN_FUNCTION_ME(EqualMe, _Equal)
+SIMPLE_MAX_MIN_FUNCTION(Equal, _Equal, MATH_EQUAL)
+SIMPLE_MAX_MIN_FUNCTION_VOID(Equal, _Equal, MATH_EQUAL)
+_SIMPLE_MAX_MIN_FUNCTION_ME(_NotEqualMe, _NotEqual)
+SIMPLE_MAX_MIN_FUNCTION_ME(NotEqualMe, _NotEqual)
+SIMPLE_MAX_MIN_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
+SIMPLE_MAX_MIN_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
 _SIMPLE_MAX_MIN_FUNCTION_ME(_MaxMe, _Max)
 SIMPLE_MAX_MIN_FUNCTION_ME(MaxMe, _Max)
 SIMPLE_MAX_MIN_FUNCTION(Max, _Max, MATH_MAX)
...
@@ -134,6 +134,9 @@ void _Cuda##funcName(const XTensor * a, const XTensor * b, XTensor * c) \
     BacktoCudaDev(a->devID, devIDBackup); \
 }
+SIMPLE_MAX_MIN_FUNCTION_GPU(Equal, cudaIsEqual)
+SIMPLE_MAX_MIN_FUNCTION_GPU(NotEqual, cudaIsNotEqual)
 SIMPLE_MAX_MIN_FUNCTION_GPU(Max, max)
 SIMPLE_MAX_MIN_FUNCTION_GPU(Min, min)
...
@@ -31,9 +31,15 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* check whether every entry is equal to the given value (cuda version) */
 void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
+/* check whether each entry equals the corresponding entry of another tensor (cuda version) */
+void _CudaEqual(const XTensor * a, const XTensor * b, XTensor * c);
 /* check whether every entry is not equal to the given value (cuda version) */
 void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
+/* check whether each entry differs from the corresponding entry of another tensor (cuda version) */
+void _CudaNotEqual(const XTensor * a, const XTensor * b, XTensor * c);
 /* return maximum of two tensor for each items (cuda version) */
 void _CudaMax(const XTensor * a, const XTensor * b, XTensor *c);
...
@@ -40,6 +40,20 @@ XTensor Equal(const XTensor & a, DTYPE value);
 /* check whether every entry is equal to the given value */
 void Equal(const XTensor & a, XTensor & b, DTYPE value);
+/* check whether each entry equals the corresponding entry of another tensor */
+void _Equal(const XTensor * a, const XTensor * b, XTensor * c);
+/* check whether each entry equals the corresponding entry of another tensor (do it on site) */
+void _EqualMe(XTensor * a, XTensor * b);
+/* check whether each entry equals the corresponding entry of another tensor (do it on site) */
+void EqualMe(XTensor & a, XTensor & b);
+/* check whether each entry equals the corresponding entry of another tensor (return an XTensor structure) */
+XTensor Equal(const XTensor & a, const XTensor & b);
+/* check whether each entry equals the corresponding entry of another tensor */
+void Equal(const XTensor & a, const XTensor & b, XTensor & c);
 /* check whether every entry is not equal to the given value */
 void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
@@ -56,6 +70,22 @@ XTensor NotEqual(const XTensor & a, DTYPE value);
 /* check whether every entry is not equal to the given value */
 void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
+/* check whether each entry differs from the corresponding entry of another tensor */
+void _NotEqual(const XTensor * a, const XTensor * b, XTensor * c);
+/* check whether each entry differs from the corresponding entry of another tensor (do it on site) */
+void _NotEqualMe(XTensor * a, XTensor * b);
+/* check whether each entry differs from the corresponding entry of another tensor (do it on site) */
+void NotEqualMe(XTensor & a, XTensor & b);
+/* check whether each entry differs from the corresponding entry of another tensor (return an XTensor structure) */
+XTensor NotEqual(const XTensor & a, const XTensor & b);
+/* check whether each entry differs from the corresponding entry of another tensor */
+void NotEqual(const XTensor & a, const XTensor & b, XTensor & c);
 /* return maximum of two tensor for each items */
 void _Max(const XTensor * a, const XTensor * b, XTensor * c);
...
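With the new two-tensor overloads, comparing tensors element-wise yields a 0/1 mask of the same shape. A quick sketch, assuming the InitTensor1DV2 and _SetDataFixed helpers (hypothetical tensors):

    XTensor a, b;
    InitTensor1DV2(&a, 4, X_FLOAT);
    InitTensor1DV2(&b, 4, X_FLOAT);
    _SetDataFixed(&a, 1.0F);
    _SetDataFixed(&b, 1.0F);
    XTensor mask = Equal(a, b);   // mask[i] = 1.0 where a[i] == b[i], else 0.0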
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#ifndef FLOAT16_H
#define FLOAT16_H
namespace nts { // namespace nts(NiuTrans.Tensor)
struct float16
{
private:
/*
sign is the sign bit: 1 means negative, 0 means positive
exp is the exponent, stored with an offset of 16
data is the mantissa; as in IEEE 754, the implicit leading 1 is not stored
*/
unsigned short data : 10;
unsigned short exp : 5;
unsigned short sign : 1;
// masks used to locate the highest set bit
static unsigned int mask[32];
static unsigned int pow2[32];
//int FindHighOne(const int &num, int &l, int &r);
int AbsCompare(const float16 & a, const float16 & b);
public:
float16 SetOverFlow();
// check whether the value overflows
int IsOverlFlow() const;
/* constructor by (sign, exp, data)
similar to ieee 32 floating point
sign: 1bit
exp: 5bit
data: 10bit */
float16(const int& s, const int& e, const int& d);
/* default constructor:
   initializes the 16-bit floating point number to 0 */
float16();
// constructor from a 32-bit float
float16(const float& data);
// constructor from other data types
template<class T> float16(const T& data);
void Dump();
// convert float16 to float and return
float Float();
/* assignment functions and template functions
   The float assignment function is the basic one.
   The template assignment functions force-convert other data types to float,
   then call the float assignment function.
   They currently support int and double. */
float16 operator = (const float& data);
float16 operator = (const float16& data);
template<class T> float16 operator = (const T& data);
// overload operator < (less than) a < b
int operator < (const float16& data);
template<class T> int operator < (const T& data);
// overload operator <= (less than or equal) a <= b
int operator <= (const float16& data);
template<class T> int operator <= (const T& data);
// overload operator > (greater than) a > b
int operator > (const float16& data);
template<class T> int operator > (const T& data);
// overload operator >= (greater than or equal) a >= b
int operator >= (const float16& data);
template<class T> int operator >= (const T& data);
// overload operator + (add) a + b
float16 operator + (const float16& data);
template<class T> float16 operator + (const T& data);
// overload operator += (add) a += b
float16 operator += (const float16& data);
template<class T> float16 operator += (const T& data);
// overload operator - (negative) -a
float16 operator - ();
// overload operator - (subtraction) a - b
float16 operator - (const float16& data);
template<class T> float16 operator - (const T& data);
// overload operator -= (subtraction) a -= b
float16 operator -= (const float16& data);
template<class T> float16 operator -= (const T& data);
// overload operator * (multiple) a * b
float16 operator * (const float16& data);
template<class T> float16 operator * (const T& data);
// overload operator *= (multiple) a *= b
float16 operator *= (const float16& data);
template<class T> float16 operator *= (const T& data);
// overload operator / (division) a / b
float16 GetInverse() const;
float16 operator / (const float16& data);
template<class T> float16 operator / (const T& data);
// overload operator /= (division) a /= b
float16 operator /= (const float16& data);
template<class T> float16 operator /= (const T& data);
};
} // namespace nts(NiuTrans.Tensor)
#endif /* FLOAT16_H */
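A minimal usage sketch for the class; the exact rounding behaviour depends on the implementation in Float16.cpp, which this header declares:

    using nts::float16;

    float16 a = 1.5F;                 // construct from a 32-bit float
    float16 b = 0.25F;
    float16 c = a * b + float16(2);   // arithmetic stays in the 16-bit format
    float   f = c.Float();            // back to float: 1.5 * 0.25 + 2 = 2.375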