Commit 135aadf4 by liyinqiao

Merge with Xuchen branch

parent c7559b7d
# the prefix of the generated executable file
PREFIX = NiuTrans
NIUTRANS_EXE := $(PREFIX).Tensor
# code path and generated file path
ROOT = .
SRC = $(ROOT)/source
LIB_DIR = $(ROOT)/lib
EXE_DIR = $(ROOT)/bin
# whether to generate dll
dll = 0
# 0 - use CPU
# 1 - use GPU
USE_CUDA = 1
# modify this path if necessary
CUDA_ROOT = /usr/local/cuda-9.0
CUDA_LIB_DIR = $(CUDA_ROOT)/lib64
CUDA_INCLUDE = $(CUDA_ROOT)/include
# use MKL
USE_MKL = 0
INTEL_ROOT = /opt/intel
MKL_ROOT = /opt/intel/mkl
MKL_LIB_DIR = $(MKL_ROOT)/lib/intel64/
MKL_INCLUDE = $(MKL_ROOT)/include
# use OpenBLAS
USE_OPENBLAS = 0
OPENBLAS_ROOT = /opt/OpenBLAS
OPENBLAS_LIB_DIR = $(OPENBLAS_ROOT)/lib
OPENBLAS_INCLUDE = $(OPENBLAS_ROOT)/include
SRC_DIR = $(shell find $(SRC) -type d)
# header include directories
# external dependency library directories
INC_DIR = $(SRC_DIR)
DEPLIB_DIR =
ifeq ($(USE_CUDA), 1)
INC_DIR += $(CUDA_INCLUDE)
DEPLIB_DIR += $(CUDA_LIB_DIR)
endif
ifeq ($(USE_MKL), 1)
INC_DIR += $(MKL_INCLUDE)
DEPLIB_DIR += $(MKL_LIB_DIR)
endif
ifeq ($(USE_OPENBLAS), 1)
INC_DIR += $(OPENBLAS_INCLUDE)
DEPLIB_DIR += $(OPENBLAS_LIB_DIR)
endif
# macro
MACRO =
ifeq ($(USE_CUDA), 1)
MACRO += -DUSE_CUDA
endif
ifeq ($(USE_MKL), 1)
MACRO += -DUSE_BLAS -DMKL
endif
ifeq ($(USE_OPENBLAS), 1)
MACRO += -DUSE_BLAS -DOPENBLAS
endif
# dependency
STATIC_DEPLIB =
DYNAMIC_DEPLIB = -lpthread
ifeq ($(USE_MKL), 1)
STATIC_DEPLIB += $(MKL_LIB_DIR)/libmkl_intel_lp64.a \
$(MKL_LIB_DIR)/libmkl_core.a \
$(MKL_LIB_DIR)/libmkl_intel_thread.a \
$(INTEL_ROOT)/lib/intel64/libiomp5.a
DYNAMIC_DEPLIB += -liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core
endif
ifeq ($(USE_OPENBLAS), 1)
STATIC_DEPLIB += $(OPENBLAS_LIB_DIR)/libopenblas.a
DYNAMIC_DEPLIB += -lopenblas
endif
ifeq ($(USE_CUDA), 1)
STATIC_DEPLIB += $(CUDA_LIB_DIR)/libcublas_static.a \
$(CUDA_LIB_DIR)/libculibos.a \
$(CUDA_LIB_DIR)/libnpps_static.a \
$(CUDA_LIB_DIR)/libnppc_static.a \
$(CUDA_LIB_DIR)/libcudadevrt.a \
$(CUDA_LIB_DIR)/libcurand_static.a \
/lib64/libdl.so.2
DYNAMIC_DEPLIB += -lcudart -lnvidia-ml
endif
DEPLIBS = -Wl,--start-group $(STATIC_DEPLIB) -Wl,--end-group -lm -ldl $(DYNAMIC_DEPLIB)
# specify the compilers here
CC = gcc
CXX = g++
NVCC = $(CUDA_ROOT)/bin/nvcc
ifeq ($(USE_INTEL_COMPILER), 1)
CC = icc
CXX = icc
endif
# main file
MAIN_FILE = $(SRC)/network/Main.cpp
Tensor_Main := $(SRC)/tensor/Main.cpp
Network_Main := $(SRC)/network/Main.cpp
ifeq ($(USE_CUDA), 1)
NIUTRANS_EXE := $(NIUTRANS_EXE).GPU
else
NIUTRANS_EXE := $(NIUTRANS_EXE).CPU
endif
NIUTRANS_DLL := $(LIB_DIR)/lib$(NIUTRANS_EXE).so
NIUTRANS_EXE := $(EXE_DIR)/$(NIUTRANS_EXE)
# specify the compiling arguments here
CFLAGS = -std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format
# GTX 1080: arch=compute_61,code=sm_61
# K80: arch=compute_37,code=sm_37
# if the architecture is set incorrectly, the result can be `-inf`
CUDA_FLAG = -arch=sm_30 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70 \
-maxrregcount=0 --machine 64 -DUSE_CUDA --use_fast_math -std=c++11
CFLAGS += -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions
# include dir
CFLAGS += -fPIC $(addprefix -I, $(INC_DIR))
# CUDA_FLAG += $(addprefix -I, $(INC_DIR))
CXXFLAGS = $(CFLAGS)
# lib dir
LDFLAGS = $(addprefix -L, $(DEPLIB_DIR))
# source files
ifeq ($(USE_CUDA), 1)
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) $(wildcard $(dir)/*.cu))
else
SOURCES := $(foreach dir,$(SRC_DIR),$(wildcard $(dir)/*.c) $(wildcard $(dir)/*.cpp) $(wildcard $(dir)/*.cc) )
endif
SOURCES := $(subst $(Tensor_Main), ,$(SOURCES))
SOURCES := $(subst $(Network_Main), ,$(SOURCES))
# object file
OBJS := $(patsubst %.c,%.o,$(SOURCES))
OBJS := $(patsubst %.cpp,%.o,$(OBJS))
ifeq ($(USE_CUDA), 1)
OBJS := $(patsubst %.cu,%.cuo,$(OBJS))
endif
all: start lib exe finish
start:
@echo ""
@echo "Start building ..."
lib: start_lib niutrans_dll finish_lib
start_lib:
@mkdir -p $(LIB_DIR)
@echo ""
@echo "Start building library"
niutrans_dll: $(NIUTRANS_DLL)
$(NIUTRANS_DLL): $(OBJS)
ifeq ($(dll), 1)
@echo "Building dynamic link library: $(NIUTRANS_DLL)"
@$(CXX) -shared -Wall $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
else
@echo "Skip building dynamic link library"
endif
finish_lib:
@echo "Finish building library"
@echo ""
exe: start_exe niutrans_exe finish_exe
start_exe:
@mkdir -p $(EXE_DIR)
@echo ""
@echo "Start building executable file"
niutrans_exe: $(NIUTRANS_EXE)
$(NIUTRANS_EXE): $(OBJS) $(MAIN_FILE)
@echo "Building executable file: $(NIUTRANS_EXE)"
@$(CXX) $(MAIN_FILE) $(CXXFLAGS) $(MACRO) $(LDFLAGS) $(OBJS) $(DEPLIBS) -o $@
finish_exe:
@echo "Finish building executable file"
@echo ""
finish:
@echo "Finish building ..."
@echo ""
%.o: %.c
@$(CC) $(CFLAGS) -c $< -o $@
%.o: %.cpp
@$(CXX) $(CXXFLAGS) $(MACRO) -c $< -o $@
%.cuo: %.cu
ifeq ($(dll), 1)
@$(NVCC) --shared --compiler-options '-fPIC' $(CUDA_FLAG) -c $< -o $@
else
@$(NVCC) $(CUDA_FLAG) -c $< -o $@
endif
.PHONY: clean
clean:
@echo "Cleaning object files"
@-rm -f $(OBJS)
\ No newline at end of file
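The CUDA_FLAG comments above warn that a wrong -arch/-gencode setting can silently produce `-inf` results. A minimal sketch, not part of this commit (the file name check_arch.cu is hypothetical), for printing each device's compute capability so it can be checked against the -gencode list before building:

/* check_arch.cu -- build with: $(CUDA_ROOT)/bin/nvcc check_arch.cu -o check_arch */
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        /* e.g. 6.1 means the Makefile needs -gencode=arch=compute_61,code=sm_61 */
        printf("device %d: %s, compute capability %d.%d\n",
               i, prop.name, prop.major, prop.minor);
    }
    return 0;
}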
......@@ -45,7 +45,9 @@ int main( int argc, const char ** argv )
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......@@ -54,6 +56,7 @@ int main( int argc, const char ** argv )
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
//_CrtDumpMemoryLeaks();
......
......@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
......
......@@ -69,7 +69,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
......@@ -98,39 +98,39 @@ compute dE/dx for a given function y = f(x)
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
}
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
......@@ -139,27 +139,27 @@ compute dE/dy for variable y and error(loss) function E
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
if(gold == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return;
}
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
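For reference, a hedged reading of the two branches above (not code from this commit): when lossName is CROSSENTROPY and assuming the standard definition over a gold distribution g and prediction y,

    E = -\sum_i g_i \log y_i
    dE/dy_i = -g_i / y_i    (what _CrossEntropyBackward is expected to produce, with padded positions masked out when a padding tensor is given)

and when gold is NULL the gradient dE/dy is simply seeded with ones, the usual starting point of reverse-mode differentiation when y itself is the final loss node.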
......@@ -43,11 +43,11 @@ public:
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
///* compute dE/dx for a given function y = f(x) */
//void Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
......
......@@ -530,7 +530,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
......
......@@ -20,7 +20,7 @@
* This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in
* "A Neural Probabilistic Language Model" by Bengio et al.
* Journal of Machine Learning Research 3 (2003) 1137C1155
* Journal of Machine Learning Research 3 (2003) 1137-1155
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
*/
......@@ -469,6 +469,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* update model parameters */
Update(model, grad, learningRate, false);
/* get probabilities */
float prob = GetProb(output, gold);
loss -= prob;
}
else{
/* gradient = 0 */
......@@ -480,23 +484,19 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(lossTensor);
//autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
/* get probabilities */
float prob = ReduceSumAll(lossTensor);
loss += prob;
}
/* get probabilities */
float prob = GetProb(output, gold);
prob = ReduceSumAll(lossTensor);
loss += prob;
wordCount += ngramNum;
wordCountTotal += ngramNum;
......@@ -579,9 +579,6 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */
_Sum(para, paraGrad, para, -epsilon);
}
......@@ -600,14 +597,14 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
Multiply(output, gold, probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
ReduceSum(probs, wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
CopyValues(wprobs, *wordProbs);
/* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */
......@@ -619,7 +616,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
ReduceSum(probs, result, 1);
return result.Get1D(0);
}
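A hedged note on GetProb above: assuming gold is a one-hot matrix and output holds the log-probabilities produced by the LogSoftmax at the end of the forward pass (shown further down), the element-wise product followed by the two reductions yields the batch log-likelihood,

    prob = \sum_{i,j} gold[i][j] * output[i][j] = \sum_i log P(w_i | context_i)

which is why the training loop accumulates it with `loss -= prob`.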
......@@ -784,7 +781,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate word embedding of position i:
embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding);
MatrixMul(input, X_NOTRANS, w, X_NOTRANS, embedding);
eList.Add(&net.embeddings[i]);
}
......@@ -792,7 +789,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* concatenate word embeddings
embeddingcat = cat(embedding_0...embedding_{n-1}) */
InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
_Concatenate(&eList, &net.embeddingCat, 1);
Concatenate(eList, net.embeddingCat, 1);
/* go over each hidden layer */
for(int i = 0; i < depth; i++){
......@@ -807,22 +804,22 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate hidden states of layer i:
s = h_pre * w */
_MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s);
MatrixMul(h_pre, X_NOTRANS, w, X_NOTRANS, s);
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
Unsqueeze(b, b2D, 0, batchSize);
/* introduce bias term:
s = s + b
NOTE: the trick here is to extend b to a 2d tensor
to fit into the 2d representation in tensor summation */
_Sum(&s, &b2D, &s);
Sum(s, b2D, s);
/* pass the state through the hard tanh function:
h = tanh(s) */
_HardTanH(&s, &h);
HardTanH(s, h);
}
/* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
......@@ -840,16 +837,16 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
MatrixMul(h_last, X_NOTRANS, w, X_NOTRANS, s);
XTensor b2D;
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
Unsqueeze(b, b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
Sum(s, b2D, s);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
LogSoftmax(s, y, 1);
}
}
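The bias trick in the comments above (extend b to a 2-D tensor with Unsqueeze, then Sum) is ordinary row-wise broadcasting. A minimal scalar sketch of the same computation, independent of the XTensor API (names are illustrative):

/* s[i][j] += b[j] for every row i -- what Unsqueeze(b, b2D, 0, batchSize) + Sum(s, b2D, s) amounts to */
void AddBiasRows(float * s, const float * b, int batchSize, int hSize)
{
    for (int i = 0; i < batchSize; i++)
        for (int j = 0; j < hSize; j++)
            s[i * hSize + j] += b[j];
}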
......@@ -891,18 +888,18 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
x is the top most hidden layer)
so we know
dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
ReduceSum(deds, dedb, 0);
/* then, we compute
dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
= \sum_j' (dE/ds_{j'} * w_{j, j'})
i.e.,
dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
XTensor &gradPassed = dedx;
XTensor dedsHidden;
......@@ -927,20 +924,20 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
_HardTanHBackward(&h, &s, &dedh, &deds);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0);
ReduceSum(deds, dedb, 0);
/* gradient of the input: dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx);
MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
if (i > 0)
_CopyValues(&dedx, &gradPassed);
CopyValues(dedx, gradPassed);
}
TensorList eList(n - 1);
......@@ -955,7 +952,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;
/* split the concatenation of gradients of the embeddings */
_Split(&dedyCat, &eList, 1, n - 1);
Split(dedyCat, eList, 1, n - 1);
/* go over for each word */
for (int i = 0; i < n - 1; i++) {
......@@ -966,7 +963,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* gradient of the embedding weight: dE/dw += x^T * dE/dy
NOTE that we accumulate dE/dw here because the matrix w
is shared by several layers (or words) */
_MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F);
MatrixMul(x, X_TRANS, *dedy, X_NOTRANS, dedw, 1.0F, 1.0F);
delete dedy;
}
......@@ -1171,9 +1168,10 @@ void Test(const char * test, const char * result, FNNModel &model)
else {
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
output = Log(output);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
......@@ -1201,6 +1199,7 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
......@@ -297,7 +297,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
Descale(preID, sizeVocab);
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
......@@ -311,13 +311,13 @@ void T2TSearch::Generate(T2TStateBundle * beam)
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
XTensor indexCPU;
InitTensorV2(&indexCPU, index.order, index.dimSize, index.dataType, -1);
CopyValues(index, indexCPU);
XTensor indexGPU;
indexGPU = CopyValues(index);
//InitTensor(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
//CopyValues(index, indexCPU);
for (int i = 0; i < indexCPU.unitNum; i++)
indexCPU.SetInt(i * stride + indexCPU.GetInt(i), i);
for (int i = 0; i < indexGPU.unitNum; i++)
indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i);
CheckNTErrors(XTensor::IsSameShaped(&prob, &probPath), "Wrong tensor shape!");
......@@ -338,8 +338,8 @@ void T2TSearch::Generate(T2TStateBundle * beam)
prob.Reshape(1, prob.unitNum);
probTopK.Reshape(1, probTopK.unitNum);
_Gather(&probPath, &probPathTopK, probPathTopK.order - 1, (int*)indexCPU.data, indexCPU.unitNum);
_Gather(&prob, &probTopK, probTopK.order - 1, (int*)indexCPU.data, indexCPU.unitNum);
_CopyIndexed(&probPath, &probPathTopK, probPathTopK.order - 1, &indexGPU);
_CopyIndexed(&prob, &probTopK, probTopK.order - 1, &indexGPU);
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
......
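The index manipulation above is plain integer arithmetic over a flattened (beam x vocab) score matrix. A minimal sketch with illustrative values (not from this commit):

/* decompose a flat top-k id over a (beamSize x sizeVocab) matrix */
int sizeVocab = 4096;               /* hypothetical vocabulary size */
int flatID    = 12345;              /* hypothetical top-k index     */
int prevState = flatID / sizeVocab; /* previous hypothesis id -- what DescaleMe(preID, sizeVocab) recovers */
int wordID    = flatID % sizeVocab; /* predicted word id within the vocabulary */

The loop `indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i)` then turns each per-row offset into a global offset into the reshaped (1, unitNum) tensors, presumably so that a single _CopyIndexed call can gather across all rows at once.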
......@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
}
}
/****************************************************
/*
Below is for calling CPU BLAS for fast matrix operations
I'm not sure how fast it is. But it seems that other
guys are crazy about this. So I decided to have a try.
......@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
return f;
}
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -51,7 +51,13 @@ bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
bool useCUDA = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -59,6 +59,8 @@ const char * GetOPName(int type)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MASK)
return "M_MASK";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
......
......@@ -48,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
......@@ -79,7 +80,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_SELECT GETANDSET + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_SELECT GETANDSET_CONVERTDATATYPE + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
......
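The operator IDs in XName.h are assigned by chaining #defines, so inserting M_MASK means defining it off the previous entry and re-basing the next one, as the hunk above does. A minimal standalone sketch of the scheme (values illustrative, not the library's):

#define MATH_DIVDIM    10                   /* some existing id           */
#define MATH_MASK      MATH_DIVDIM + 1      /* 11: the newly inserted op  */
#define MATH_MATRIXMUL MATH_MASK + 1        /* 12: now defined off MASK   */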
......@@ -48,6 +48,7 @@
#include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h"
#include "function/Identity.h"
#include "core/CHeader.h"
#ifdef USE_CUDA
......@@ -485,6 +486,12 @@ XTensor XTensor::operator- (const DTYPE shift) const
return ScaleAndShift(*this, 1, -shift);
}
/* overloading of the unary minus-sign (negation) */
XTensor XTensor::operator- () const
{
return Negate(*this);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor) const
{
......@@ -837,6 +844,12 @@ void XTensor::SetData(const void * d, int num, int beg)
XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
}
/* generate data items with a uniform distribution in [0, 1] */
void XTensor::Rand(int rNum, int cNum)
{
_SetDataRand(this, rNum, cNum);
}
/*
set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
......@@ -2425,7 +2438,7 @@ initialize a dense 5d tensor V2
*/
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[5];
dims[0] = d0;
......
......@@ -238,6 +238,9 @@ public:
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift) const;
/* overloading of the unary minus-sign (negation) */
XTensor operator- () const;
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor) const;
......@@ -301,6 +304,9 @@ public:
/* set the tensor with an data array */
void SetData(const void * d, int num, int beg = 0);
/* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
......@@ -497,7 +503,7 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
......
......@@ -36,13 +36,9 @@
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
......@@ -56,7 +52,6 @@
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
......@@ -97,5 +92,4 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
......@@ -151,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
return c;
}
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
if (c.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
}
}
\ No newline at end of file
......@@ -34,7 +34,7 @@ c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (on site):
......@@ -42,10 +42,10 @@ a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha);
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha = 0.0);
/*
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......@@ -53,6 +53,14 @@ where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
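For reference, a minimal scalar sketch of what _Mask computes, independent of the XTensor API (names are illustrative):

/* c[i] = a[i] where mask[i] != 0, otherwise alpha */
void MaskArray(const float * a, const int * mask, float * c, int n, float alpha)
{
    for (int i = 0; i < n; i++)
        c[i] = (mask[i] != 0) ? a[i] : alpha;
}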
......@@ -202,7 +202,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c)
{
if (!(a && b && c))
return false;
......@@ -231,10 +233,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen
dimSize[sub++] = bm;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i])
if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
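The added delete[] calls above fix a leak of dimSize on the early-return path. A hedged alternative sketch (not what this commit does) is to let an RAII holder release the array on every path:

#include <memory>

/* sketch: std::unique_ptr<int[]> frees the array on every return, no explicit delete[] needed */
bool AllPositive(const int * src, int order)
{
    std::unique_ptr<int[]> dimSize(new int[order]);
    for (int i = 0; i < order; i++) {
        dimSize[i] = src[i];
        if (dimSize[i] <= 0)
            return false;
    }
    return true;
}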
......@@ -303,8 +308,8 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner)
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -337,7 +342,7 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
if (c.enableGrad) {
/* tensor connections */
......@@ -400,7 +405,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
}
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......
......@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
......@@ -56,11 +59,16 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0,
XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
......@@ -69,7 +77,6 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -154,7 +154,7 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
......
......@@ -66,7 +66,7 @@ operation c = x * w + b MulAndShift
<< return - the result of matrix multiplication
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
......@@ -129,9 +129,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DelTensorBuf(tmp);
return c;
}
}
\ No newline at end of file
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -123,9 +123,9 @@ where i is the item index
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void NegateMe(XTensor& a)
{
_Negate(&a, &a);
}
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void Negate(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
b[i] = __float2half(-__half2float(a[i]));
#endif
}
/*
set each entry to its negative value
>> a - input tensor
>> b - output tensor
*/
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type */
__global__
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_H__
#define __NEGATE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
void _Negate(const XTensor * a, XTensor * b);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
void NegateMe(XTensor & a);
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
db[i] = 1.0F;
else if (d[i] == 0)
db[i] = 0.0F;
else
db[i] = -1.0F;
}
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void SignMe(XTensor& a)
{
_Sign(&a, &a);
}
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void Sign(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its sign value (CUDA Kernel)
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (a[i] > 0)
b[i] = 1.0F;
else if (a[i] == 0)
b[i] = 0.0F;
else
b[i] = -1.0F;
}
}
/*
set each entry to its sign value with float16 data type (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(__half * a, __half * b, int size)
{
/* float16 sign is not implemented yet; this kernel is a stub */
return;
}
/*
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _CudaSign(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its sign value (CUDA Kernel) */
__global__
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_H__
#define __SIGN_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void SignMe(XTensor & a);
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sum of a tensor and a vector (column vector) in a column by column manner
for each column a_col (in a block), we have
c_col = a_col + b * \beta
where b is a vector.
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int blockSize = colNum * rowNum;
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSumByColumnTV(a, b, c, beta);
#endif
}
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "TODO!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < rowNum; i++) {
DTYPE * ap = (DTYPE*)a->data + k * blockSize + i * colNum;
DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data + k * blockSize + i * colNum;
DTYPE v = bp[i];
for (int j = 0; j < colNum; j++)
cp[j] = ap[j] + v * beta;
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a tensor and a vector (column vector)
c_col = a_col + b * \beta
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a
>> colNum - column number (of a block)
>> blockSize - size of a block
>> size - size of the entire data array
>> beta - the scaling factor
*/
__global__
void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= size)
return;
int offset = i % blockSize;
int row = offset / colNum;
c[i] = a[i] + b[row] * beta;
}
/*
summation of a tensor and a vector (column vector)
for each column a_col (in a block), we have
c_col = a_col + b * \beta
where b is a vector.
>> a - a tensor
>> b - a vector with the same column size with a
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = a->dimSize[0];
int colNum = a->dimSize[1];
int blockNum = 1;
for (int i = 2; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int cudaGridSize[3];
int cudaBlockSize[3];
GDevs.GetCudaThread(c->devID, a->unitNum, cudaGridSize, cudaBlockSize);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
KernelADDByColumnTV << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum * colNum, a->unitNum, beta);
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__
#include "../reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a tensor and a vector (column vector) */
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNTV_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sum of a vector (column vector) and a tensor in a column by column manner
for each column b_col, we have
c = a + \sum{col} b_col * \beta
where c and a are vectors, and b_col is a column in b.
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSumByColumnVT(a, b, c, beta);
#endif
}
else {
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
int blockSize = colNum * rowNum;
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "TODO!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
for (int i = 0; i < rowNum; i++) {
    DTYPE * ap = (DTYPE*)a->data;
    DTYPE * cp = (DTYPE*)c->data;
    DTYPE sum = 0;
    /* accumulate over every block and every column of row i, so that
       multi-block inputs match the CUDA kernel and the formula above */
    for (int k = 0; k < blockNum; k++) {
        DTYPE * bp = (DTYPE*)b->data + k * blockSize + i * colNum;
        for (int j = 0; j < colNum; j++)
            sum += bp[j];
    }
    cp[i] = ap[i] + sum * beta;
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
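/*
A minimal illustrative sketch (an assumption-laden example, not taken from the library's
test suite): it assumes InitTensor2D and XTensor::SetZeroAll are visible through XTensor.h.
Here a and c are column vectors with one entry per row of b, and the call computes
c(i) = a(i) + beta * (sum over all columns of row i of b).
*/
void IllustrateSumByColumnVT()
{
    XTensor a;
    XTensor b;
    XTensor c;

    InitTensor2D(&a, 2, 1);    /* column vector */
    InitTensor2D(&b, 2, 3);    /* tensor with 2 rows and 3 columns */
    InitTensor2D(&c, 2, 1);    /* result vector with the same shape as a */

    a.SetZeroAll();
    b.SetZeroAll();

    _SumByColumnVT(&a, &b, &c, (DTYPE)1.0);
}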
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a
>> colNum - column number (of a block)
>> blockSize - size of a block
>> size - size of the entire data array
>> beta - the scaling factor
*/
__global__
void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
{
int row = blockDim.x * blockIdx.x + threadIdx.x;
if (row >= rowNum)
return;
DTYPE sum = 0;
for (int k = 0; k < blockNum; k++) {
DTYPE * bp = b + (rowNum * k + row) * colNum;
if (colNum % 4 == 0) {
for (int i = 0; i < colNum; i += 4)
sum += bp[i] + bp[i + 1] + bp[i + 2] + bp[i + 3];
}
else if (colNum % 2 == 0) {
for (int i = 0; i < colNum; i += 2)
sum += bp[i] + bp[i + 1];
}
else {
for (int i = 0; i < colNum; i++)
sum += bp[i];
}
__syncthreads();
}
c[row] = a[row] + beta * sum;
}
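/*
note on the loops above (illustration only): the per-row column sum is manually
unrolled by 4 or by 2 whenever colNum allows it, purely to cut loop overhead;
all three branches produce the same sum, e.g. for colNum = 4 a thread adds
bp[0] + bp[1] + bp[2] + bp[3] in one iteration.
*/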
/*
summation of a vector (column vector) and a tensor
for each column b_col, we have
c = a + \sum{col} b_col * \beta
where c and a are vectors, and b_col is a column in b.
>> a - a vector with the same column size as b
>> b - a tensor
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
"Illegal input vector size!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE), "TODO");
int rowNum = b->dimSize[0];
int colNum = b->dimSize[1];
int blockNum = 1;
for (int i = 2; i < b->order; i++)
blockNum *= b->dimSizeRDI[i];
int cudaGridSize[3];
int cudaBlockSize[3];
GDevs.GetCudaThread(c->devID, a->dimSizeRDI[1], cudaGridSize, cudaBlockSize);
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
KernelADDByColumnVT << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum, blockNum, beta);
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNVT_CUH__
#define __SUMBYCOLUMNVT_CUH__
#include "SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* summation of a vector (column vector) and a tensor */
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNVT_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMBYCOLUMNVT_H__
......@@ -20,20 +20,55 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "ConvertDataType.h"
#include "ConvertDataType.cuh"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT,
int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
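/*
A minimal illustrative sketch (an assumption-laden example, not taken from the library's
test suite): a CPU-side float -> float16 -> float round trip over a small stack array,
using the raw-buffer ConvertDataType above with devID = -1.
*/
void IllustrateRawConvertDataType()
{
    float src[4] = {1.0F, 0.5F, -2.0F, 3.25F};
    unsigned short halfBuf[4];    /* float16 values are kept in unsigned short storage here */
    float back[4];

    ConvertDataType(-1, src, X_FLOAT, halfBuf, X_FLOAT16, 4);
    ConvertDataType(-1, halfBuf, X_FLOAT16, back, X_FLOAT, 4);
    /* back[] now holds the float16-rounded versions of the values in src[] */
}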
/*
convert data type
>> input - input tensor
>> output - output tensor
>> input - the input tensor
>> output - the output tensor
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -59,6 +94,50 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
}
else
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - the input tensor
<< return - the output tensor with the specified data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
if (input.dataType == dataType) {
XTensor output;
output = CopyValues(input);
return output;
}
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
_ConvertDataType(&input, &output);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
return output;
}
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType)
{
if (!output.isInit || input.dataType != output.dataType) {
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
}
_ConvertDataType(&input, &output);
/* tensor connection */
if (output.enableGrad)
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
}
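/*
A minimal illustrative sketch (an assumption-laden example): it assumes InitTensor2D
creates an X_FLOAT tensor by default and that the float <-> int conversion is supported
on the chosen device; it shows the two calling styles defined above.
*/
void IllustrateConvertDataType()
{
    XTensor x;
    InitTensor2D(&x, 2, 2);    /* a small float tensor on the CPU by default */
    x.SetZeroAll();

    /* style 1: build and return a new tensor with the requested data type */
    XTensor xInt = ConvertDataType(x, X_INT);

    /* style 2: write the converted data into a caller-provided tensor */
    XTensor y;
    ConvertDataType(x, y, X_INT);
}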
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
......@@ -67,44 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}}
/*
data conversion (cuda code)
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void _CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID >= 0), "This code must be run on GPUs!");
if(typeS == typeT)
return;
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(typeS == X_FLOAT && typeT == X_FLOAT16)
KernelFloatToFloat16<<<blocks, threads>>>((float*)s, (__half*)t, size);
else if(typeS == X_FLOAT16 && typeT == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)s, (float*)t, size);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
ProtectCudaDev(devID, devIDBackup);
}
/*
......@@ -114,8 +77,6 @@ convert data type (cuda code)
*/
void _CudaConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......@@ -131,13 +92,17 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
ProtectCudaDev(input->devID, devIDBackup);
if(input->dataType == X_FLOAT && output->dataType == X_INT)
KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
KernelFloatToInt<<<blocks, threads>>>
((float*)input->data, (int*)output->data, input->unitNum);
else if(input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
KernelIntToFloat<<<blocks, threads>>>
((int*)input->data, (float*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
KernelFloatToFloat16<<<blocks, threads>>>
((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
KernelFloat16ToFloat<<<blocks, threads>>>
((__half*)input->data, (float*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __CONVERTDATATYPE_CUH__
#define __CONVERTDATATYPE_CUH__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __CONVERTDATATYPE_H__
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* data conversion (for lower precision computation) */
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
/* convert data type */
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
......@@ -466,13 +466,23 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
}
}
/* generate data items with a uniform distribution in [0, 1] */
void _SetDataRand(XTensor * tensor, int rNum, int cNum)
{
if (tensor == NULL || tensor->isInit == false || tensor->order != 2) {
InitTensor2D(tensor, rNum, cNum);
}
_SetDataRand(tensor, 0.0F, 1.0F);
}
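/*
A minimal illustrative sketch (an assumption-laden example): an uninitialized tensor
passed to the overload above is (re)allocated as a rNum x cNum matrix and then filled
with uniform values in [0, 1].
*/
void IllustrateSetDataRand01()
{
    XTensor t;
    _SetDataRand(&t, 3, 4);    /* t becomes a 3 x 4 tensor with entries drawn from [0, 1] */
}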
/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
*/
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
......@@ -525,7 +535,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
......
......@@ -569,15 +569,17 @@ void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower;
if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandFloat <<<blocks, threads >>>
((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandDouble <<<blocks, threads >>>
((double*)tensor->data, tensor->unitNum, lower, variance);
}
BacktoCudaDev(tensor->devID, devIDBackup);
......
......@@ -63,12 +63,15 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [0, 1] */
void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
......@@ -26,230 +26,167 @@
namespace nts {
int scale(int x, int scale)
template<class T1, class T2>
T1 BinaryDescale(T1 x, T2 num)
{
return x * scale;
return (T1)(x / num);
}
float scale(float x, float scale)
template<class T1, class T2>
T1 BinaryPower(T1 x, T2 num)
{
return x * scale;
if (num == 0)
return (T1)1.0;
else if (num == 0.5)
return (T1)sqrt(x);
else if (num == 2)
return x * x;
else {
if (x == 0 && num < 0)
return (T1)1e20F;
else
return (T1)pow(x, num);
}
}
int descale(int x, int descale)
template<class T1, class T2>
T1 BinaryScale(T1 x, T2 num)
{
return x / descale;
return (T1)(x * num);
}
float descale(float x, float descale)
template<class T1, class T2>
T1 BinaryShift(T1 x, T2 num)
{
return x / descale;
return (T1)(x + num);
}
int shift(int x, int shift)
int BinaryMod(int x, int num)
{
return x + shift;
return x % num;
}
float shift(float x, float shift)
{
return x + shift;
}
int mod(int x, int mod)
{
return x % mod;
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, float num) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
} \
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
/* define three macros separately, specifying the respective function names */
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
template<class T> \
void _funcName(const XTensor * a, XTensor * b, T num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b, num); \
return; \
} \
else \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc((int)d[i], (T)num); \
} \
else if (a->dataType == X_FLOAT) { \
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc((float)d[i], (T)num); \
} \
else if (a->dataType == X_DOUBLE) { \
double * d = (double*)a->data; \
double * db = (double*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (double)origFunc((double)d[i], (T)num); \
} \
else \
ShowNTErrors("TO DO!"); \
} \
template void _funcName<int>(const XTensor*, XTensor*, int); \
template void _funcName<float>(const XTensor*, XTensor*, float); \
template void _funcName<double>(const XTensor*, XTensor*, double);
#define _SIMPLE_BINARY_FUNCTION_ME(_funcNameMe, _funcName) \
template<class T> \
void _funcNameMe(XTensor * a, T num) \
{ \
_funcName(a, a, num); \
} \
template void _funcNameMe<int>(XTensor*, int); \
template void _funcNameMe<float>(XTensor*, float); \
template void _funcNameMe<double>(XTensor*, double);
#define SIMPLE_BINARY_FUNCTION_ME(funcNameMe, _funcName) \
template<class T> \
void funcNameMe(XTensor &a, T num) \
{ \
_funcName(&a, &a, num); \
} \
template void funcNameMe<int>(XTensor&, int); \
template void funcNameMe<float>(XTensor&, float); \
template void funcNameMe<double>(XTensor&, double);
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
template<class T> \
XTensor funcName(const XTensor &a, T num) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
return b; \
} \
template XTensor funcName<int>(const XTensor&, int); \
template XTensor funcName<float>(const XTensor&, float); \
template XTensor funcName<double>(const XTensor&, double);
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
template<class T> \
void funcName(const XTensor &a, XTensor &b, T num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \
} \
template void funcName<int>(const XTensor&, XTensor&, int); \
template void funcName<float>(const XTensor&, XTensor&, float); \
template void funcName<double>(const XTensor&, XTensor&, double);
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescale, BinaryDescale)
_SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Mod, _CudaMod, BinaryMod)
_SIMPLE_BINARY_FUNCTION_ME(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_ME(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION(Mod, _Mod, MATH_MOD)
SIMPLE_BINARY_FUNCTION_VOID(Mod, _Mod, MATH_MOD)
_SIMPLE_BINARY_FUNCTION(_Power, _CudaPower, BinaryPower)
_SIMPLE_BINARY_FUNCTION_ME(_PowerMe, _Power)
SIMPLE_BINARY_FUNCTION_ME(PowerMe, _Power)
SIMPLE_BINARY_FUNCTION(Power, _Power, MATH_POWER)
SIMPLE_BINARY_FUNCTION_VOID(Power, _Power, MATH_POWER)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScale, BinaryScale)
_SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShift, BinaryShift)
_SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
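/*
A minimal illustrative sketch of the templated interface instantiated above (an
assumption-laden example): it assumes InitTensor2D and XTensor::SetZeroAll are
visible here and that the default data type is X_FLOAT.
*/
void IllustrateBinaryOps()
{
    XTensor a;
    InitTensor2D(&a, 2, 2);
    a.SetZeroAll();

    XTensor b = Scale(a, 3);       /* b = a * 3, instantiated with T = int */
    XTensor c = Power(a, 2.0F);    /* c = a squared, instantiated with T = float */
    ShiftMe(a, 1.0F);              /* in place: a = a + 1 */
}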
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
_funcName(&a, &b, num); \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
} // namespace nts(NiuTrans.Tensor)
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -28,134 +29,108 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
__device__
int cudascale(int x, int scale)
int BinaryCudaMod(int x, int base)
{
return x * scale;
return x % base;
}
template<class T1, class T2>
__device__
float cudascale(float x, float scale)
T1 BinaryCudaDescale(T1 x, T2 num)
{
return x * scale;
return x / num;
}
template<class T1, class T2>
__device__
int cudadescale(int x, int descale)
T1 BinaryCudaPower(T1 x, T2 num)
{
return x / descale;
if (num == 0)
return (T1)1.0;
else if (num == 0.5)
return (T1)sqrt((float)x);
else if (num == 2)
return (T1)(x * x);
else {
if (x == 0 && num < 0)
return (T1)1e20F;
else
return (T1)pow((float)x, (float)num);
}
}
template<class T1, class T2>
__device__
float cudadescale(float x, float descale)
T1 BinaryCudaScale(T1 x, T2 num)
{
return x / descale;
return x * num;
}
template<class T1, class T2>
__device__
int cudashift(int x, int shift)
T1 BinaryCudaShift(T1 x, T2 num)
{
return x + shift;
}
__device__
float cudashift(float x, float descale)
{
return x + descale;
}
__device__
int cudamod(int x, int mod)
{
return x % mod;
}
#define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(int * a, int * b, int size, int num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (int)origFunc(a[i], num); \
} \
\
void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum, num); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
#define SIMPLE_BINARY_FUNCTION_FLOAT_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(float * a, float * b, int size, float num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (float)origFunc(a[i], num); \
} \
\
\
void _Cuda##funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, num);\
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
return x + num;
}
SIMPLE_BINARY_FUNCTION_GPU(Scale, cudascale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ScaleFloat, cudascale)
SIMPLE_BINARY_FUNCTION_GPU(Descale, cudadescale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(DescaleFloat, cudadescale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, cudashift)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ShiftFloat, cudashift)
SIMPLE_BINARY_FUNCTION_GPU(Mod, cudamod)
#define SIMPLE_BINARY_FUNCTION_GPU(funcName, origFunc) \
template<class T1, class T2> \
__global__ \
void Kernel##funcName(T1 * a, T1 * b, int size, T2 num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (T1)origFunc((T1)a[i], (T2)num); \
} \
\
template<class T> \
void _Cuda##funcName(const XTensor * a, XTensor * b, T num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, (T)num); \
} \
else if (a->dataType == X_DOUBLE) { \
Kernel##funcName<<<blocks, threads>>> \
((double*)a->data, (double*)b->data, a->unitNum, (T)num); \
} \
else if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum, (T)num); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
template void _Cuda##funcName<int>(const XTensor*, XTensor*, int); \
template void _Cuda##funcName<float>(const XTensor*, XTensor*, float); \
template void _Cuda##funcName<double>(const XTensor*, XTensor*, double);
SIMPLE_BINARY_FUNCTION_GPU(Descale, BinaryCudaDescale)
SIMPLE_BINARY_FUNCTION_GPU(Mod, BinaryCudaMod)
SIMPLE_BINARY_FUNCTION_GPU(Power, BinaryCudaPower)
SIMPLE_BINARY_FUNCTION_GPU(Scale, BinaryCudaScale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, BinaryCudaShift)
#endif // USE_CUDA
......
......@@ -29,38 +29,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* scale each entry (CUDA Kernel) */
__global__
void KernelScale(int * a, int * b, int size, int scale);
__global__
void KernelScale(int * a, int * b, int size, float scale);
/* scale each entry */
void _CudaScale(const XTensor * a, XTensor * b, int scale);
void _CudaScaleFloat(const XTensor * a, XTensor * b, float scale);
/* descale each entry (CUDA Kernel) */
__global__
void KernelDescale(int * a, int * b, int size, int scale);
__global__
void KernelDescale(int * a, int * b, int size, float scale);
/* descale each entry */
void _CudaDescale(const XTensor * a, XTensor * b, int scale);
void _CudaDescaleFloat(const XTensor * a, XTensor * b, float scale);
template<class T>
void _CudaDescale(const XTensor * a, XTensor * b, T num);
/* shift each entry (CUDA Kernel) */
__global__
void KernelShift(int * a, int * b, int size, int shift);
__global__
void KernelShift(int * a, int * b, int size, float shift);
/* shift each entry */
void _CudaShift(const XTensor * a, XTensor * b, int shift);
void _CudaShiftFloat(const XTensor * a, XTensor * b, float shift);
/* power each entry */
template<class T>
void _CudaPower(const XTensor * a, XTensor * b, T num);
/* mod each entry (CUDA Kernel) */
__global__
void KernelMod(int * a, int * b, int size, int base);
/* mod each entry */
void _CudaMod(const XTensor * a, XTensor * b, int base);
template<class T>
void _CudaMod(const XTensor * a, XTensor * b, T base);
/* scale each entry */
template<class T>
void _CudaScale(const XTensor * a, XTensor * b, T num);
/* shift each entry */
template<class T>
void _CudaShift(const XTensor * a, XTensor * b, T num);
#endif // USE_CUDA
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-04-05
*/
#ifndef __BINARY_H__
#define __BINARY_H__
......@@ -26,132 +26,110 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
scale up tensor entires
b = a * scale
*/
void _Scale(const XTensor * a, XTensor * b, int scale);
void _Scale(const XTensor * a, XTensor * b, float scale);
/*
scale up tensor entires (on site)
b = a * scale
*/
void _ScaleMe(XTensor * a, int scale);
void _ScaleMe(XTensor * a, float scale);
/*
scale up tensor entires (on site)
b = a * scale
*/
void ScaleMe(XTensor & a, int scale);
void ScaleMe(XTensor & a, float scale);
/*
scale up tensor entires
b = a * scale
*/
void Scale(const XTensor & a, XTensor &b, int scale);
void Scale(const XTensor & a, XTensor &b, float scale);
/*
scale up tensor entires (return an XTensor structure)
b = a * scale
*/
XTensor Scale(const XTensor & a, float scale);
/*
descale tensor entires
b = a / scale
*/
void _Descale(const XTensor * a, XTensor * b, int scale);
void _Descale(const XTensor * a, XTensor * b, float scale);
/*
descale tensor entires (on site)
b = a / scale
*/
void _DescaleMe(XTensor * a, int scale);
void _DescaleMe(XTensor * a, float scale);
/*
descale tensor entires (on site)
b = a / scale
*/
void DescaleMe(XTensor & a, int scale);
void DescaleMe(XTensor & a, float scale);
/*
descale tensor entires
b = a / scale
*/
void Descale(const XTensor & a, XTensor & b, int scale);
void Descale(const XTensor & a, XTensor & b, float scale);
/*
descale tensor entires (return an XTensor structure)
b = a / scale
*/
XTensor Descale(const XTensor & a, float scale);
/*
shift tensor entires
b = a + shift
*/
void _Shift(const XTensor * a, XTensor * b, int shift);
void _Shift(const XTensor * a, XTensor * b, float shift);
/*
shift tensor entires (on site)
b = a + shift
*/
void _ShiftMe(XTensor * a, int shift);
void _ShiftMe(XTensor * a, float shift);
/*
shift tensor entires (on site)
b = a + shift
*/
void ShiftMe(XTensor & a, int shift);
void ShiftMe(XTensor & a, float shift);
/*
shift tensor entires
b = a + shift
*/
void Shift(const XTensor & a, XTensor & b, int shift);
void Shift(const XTensor & a, XTensor & b, float shift);
/*
shift tensor entires (return an XTensor structure)
b = a + shift
*/
XTensor Shift(const XTensor & a, float shift);
/*
mod tensor entires
b = a % mod
*/
void _Mod(const XTensor * a, XTensor * b, int base);
/*
mod tensor entires (on site)
b = a % mod
*/
void _ModMe(XTensor * a, int base);
/*
mod tensor entires (on site)
b = a % mod
*/
void ModMe(XTensor & a, int base);
/*
mod tensor entires
b = a % mod
*/
void Mod(const XTensor & a, XTensor & b, int base);
/* descale tensor entries
b = a / num */
template<class T>
void _Descale(const XTensor * a, XTensor * b, T num);
/* descale tensor entries (on site)
b = a / num */
template<class T>
void _DescaleMe(XTensor * a, T num);
/* descale tensor entries (on site)
b = a / num */
template<class T>
void DescaleMe(XTensor & a, T num);
/* descale tensor entries
b = a / num */
template<class T>
void Descale(const XTensor & a, XTensor & b, T num);
/* descale tensor entries (return an XTensor structure)
b = a / num */
template<class T>
XTensor Descale(const XTensor & a, T num);
/* mod tensor entries
b = a % base */
template<class T>
void _Mod(const XTensor * a, XTensor * b, T base);
/* mod tensor entries (on site)
b = a % base */
template<class T>
void _ModMe(XTensor * a, T base);
/* mod tensor entries (on site)
b = a % base */
template<class T>
void ModMe(XTensor & a, T base);
/* mod tensor entries
b = a % base */
template<class T>
void Mod(const XTensor & a, XTensor & b, T base);
/* mod tensor entries (return an XTensor structure)
b = a % base */
template<class T>
XTensor Mod(const XTensor & a, T base);
/* get the power(x, y)
b = power(a, num) */
template<class T>
void _Power(const XTensor * a, XTensor * b, T scale);
/* get the power(x, y) (on site)
b = power(a, num) */
template<class T>
void _PowerMe(XTensor * a, T scale);
/* get the power(x, y) (on site)
b = power(a, num) */
template<class T>
void PowerMe(XTensor & a, T scale);
/* get the power(x, y)
b = power(a, num) */
template<class T>
void Power(const XTensor & a, XTensor & b, T scale);
/* get the power(x, y) (return an XTensor structure)
b = power(a, num) */
template<class T>
XTensor Power(const XTensor & a, T scale);
/* scale up tensor entries
b = a * num */
template<class T>
void _Scale(const XTensor * a, XTensor * b, T num);
/* scale up tensor entries (on site)
b = a * num */
template<class T>
void _ScaleMe(XTensor * a, T num);
/* scale up tensor entries (on site)
b = a * num */
template<class T>
void ScaleMe(XTensor & a, T num);
/* scale up tensor entries
b = a * num */
template<class T>
void Scale(const XTensor & a, XTensor & b, T num);
/* scale up tensor entries (return an XTensor structure)
b = a * num */
template<class T>
XTensor Scale(const XTensor & a, T num);
/* shift tensor entries
b = a + num */
template<class T>
void _Shift(const XTensor * a, XTensor * b, T num);
/* shift tensor entries (on site)
b = a + num */
template<class T>
void _ShiftMe(XTensor * a, T num);
/* shift tensor entries (on site)
b = a + num */
template<class T>
void ShiftMe(XTensor & a, T num);
/* shift tensor entries
b = a + num */
template<class T>
void Shift(const XTensor & a, XTensor & b, T num);
/* shift tensor entries (return an XTensor structure)
b = a + num */
template<class T>
XTensor Shift(const XTensor & a, T num);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,88 +37,72 @@ DTYPE myIsNotEqual(DTYPE a, DTYPE b)
}
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \
return; \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
/* define three macros separately, specifying the respective function names */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b, number); \
return; \
} \
else \
ShowNTErrors("No GPU devices support!") \
} \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a, DTYPE number) \
{ \
_funcName(&a, &a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
#define SIMPLE_COMPARE_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, DTYPE number) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, number); \
}
// I think we needn't make the link here.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION_ME(EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION_ME(NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
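/*
A minimal illustrative sketch (an assumption-laden example): it assumes InitTensor2D
and XTensor::SetZeroAll are visible here. Equal and NotEqual compare every entry with
a scalar and write 1/0 flags (as DTYPE values) into the result tensor.
*/
void IllustrateCompareOps()
{
    XTensor a;
    InitTensor2D(&a, 2, 2);
    a.SetZeroAll();

    XTensor isZero = Equal(a, 0.0F);       /* every entry is 0, so all flags are 1 */
    XTensor nonZero = NotEqual(a, 0.0F);   /* all flags are 0 here */
}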
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif
......
......@@ -38,6 +38,9 @@ void EqualMe(XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value (return an XTensor structure) */
XTensor Equal(const XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value */
void Equal(const XTensor & a, XTensor & b, DTYPE value);
/* check whether every entry is not equal to the given value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
......@@ -50,6 +53,9 @@ void NotEqualMe(XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value (return an XTensor structure) */
XTensor NotEqual(const XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value */
void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
} // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__
\ No newline at end of file
......@@ -42,7 +42,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
......@@ -109,7 +111,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
......@@ -129,7 +133,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void NormalizeMe(XTensor& input, int dim, const XTensor& mean, const XTensor& var, const XTensor& a, const XTensor& b, DTYPE epsilon)
void NormalizeMe(XTensor& input, int dim,
const XTensor& mean, const XTensor& var,
const XTensor& a, const XTensor& b, DTYPE epsilon)
{
_Normalize(&input, &input, dim, &mean, &var, &a, &b, epsilon);
}
......@@ -150,7 +156,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter
<< return - the result of normalized the data with normal distribution
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMPFlag();
......@@ -171,4 +179,48 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
return output;
}
/*
normalize the data with normal distribution (write the result into the output tensor)
the output tensor is initialized here if it is not ready
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> output - the output tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
if (!output.isInit || !XTensor::IsSameShaped(&input, &output)) {
InitTensor(&output, &input);
}
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
if (output.enableGrad == true) {
/* tensor connections */
TensorList list(5);
list.Add((XTensor*)&input);
list.Add((XTensor*)&mean);
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
}
}
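/*
illustrative sketch (not part of the library API): the per-element effect of the formula
above written as a plain scalar helper, handy for checking a single value by hand; the
helper name is hypothetical.

DTYPE NormalizeOneValue(DTYPE x, DTYPE mean, DTYPE var, DTYPE a, DTYPE b, DTYPE epsilon)
{
return a * (x - mean) / (DTYPE)sqrt(var + epsilon) + b;
}

e.g., x = 3, mean = 1, var = 4, a = 1, b = 0 and epsilon = 0 give 1 * (3 - 1) / sqrt(4) + 0 = 1
*/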
} // namespace nts(NiuTrans.Tensor)
......@@ -31,7 +31,9 @@ normalized the data with normal distribution.
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
......@@ -39,7 +41,9 @@ keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
......@@ -47,7 +51,9 @@ keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void NormalizeMe(XTensor & input, int dim, const XTensor & mean, const XTensor & var, const XTensor & a, const XTensor & b, DTYPE epsilon);
void NormalizeMe(XTensor & input, int dim,
const XTensor & mean, const XTensor & var,
const XTensor & a, const XTensor & b, DTYPE epsilon);
/*
normalize the data with normal distribution (return an XTensor structure)
......@@ -55,7 +61,19 @@ make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon);
XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
/*
normalize the data with normal distribution (with both arguments of input and output)
keep the result in the output tensor
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h"
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaPower(a, b, p);
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)1.0;
}
else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)sqrt(aData[i]);
}
else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++)
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++) {
if (p < 0 && aData[i] == 0)
bData[i] = 1e20F;
else
bData[i] = (DTYPE)pow(aData[i], p);
}
}
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void _PowerMe(XTensor * a, DTYPE p)
{
_Power(a, a, p);
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void PowerMe(XTensor& a, DTYPE p)
{
_Power(&a, &a, p);
}
/*
get the power(a, p) (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
*/
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
}
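/*
illustrative usage sketch (not part of the original source), assuming a dense CPU tensor
`a` that has been initialized elsewhere:

XTensor b = Power(a, (DTYPE)2.0);   // b[i] = a[i]^2, kept in a new tensor
_PowerMe(&a, (DTYPE)0.5);           // a[i] = sqrt(a[i]), computed in place
*/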
/*
get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void Power(const XTensor & a, XTensor & b, DTYPE p)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Power function */
_Power(&a, &b, p);
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
}
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h"
#include "Power.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its square root (CUDA Kernel)
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = sqrt(a[i]);
}
/*
set each entry to its square root (CUDA Kernel)
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
b[i] = hsqrt(a[i]);
#else
if (i < size)
b[i] = __float2half(sqrt(__half2float(a[i])));
#endif
}
/*
get power(d[i], p)
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE v = a[i];
if (p < 0 && v == 0)
b[i] = 1e20;
else
b[i] = pow(a[i], p);
}
}
/*
get power(d[i], p)
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(__half * a, __half * b, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
float v = __half2float(a[i]);
if (__half2float(p) < 0 && v == 0)
b[i] = __float2half(1e20);
else
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
}
#endif
}
/* get the power of the entries */
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) {
KernelSqrtV2<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
}
else if (p != (DTYPE)1.0) {
KernelPower<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
}
}
else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) {
KernelSqrtV2<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum);
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __POWER_CUH__
#define __POWER_CUH__
#include "Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __POWER_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __POWER_H__
#define __POWER_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
void _Power(const XTensor * a, XTensor * b, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void PowerMe(XTensor & a, DTYPE p);
/*
get the power(x, y) (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
/* get the power(x, y) */
void Power(const XTensor & a, XTensor & b, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
#endif // __POWER_H__
......@@ -26,248 +26,207 @@
#include "Unary.cuh"
namespace nts{
DTYPE square(DTYPE x)
template<class T>
T UnaryNegate(T x) {
return (T)-x;
}
template<class T>
T UnarySquare(T x)
{
return x * x;
return (T)(x * x);
}
DTYPE round(DTYPE r)
template<class T>
T UnaryRound(T r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
return (r > 0.0) ? (T)floor(r + 0.5) : (T)ceil(r - 0.5);
}
DTYPE isnonzero(DTYPE r)
template<class T>
T UnarySign(T r)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
if (r > 0.0)
return (T)1.0;
else if (r == 0.0)
return (T)0.0;
else
return (T)-1.0;
}
DTYPE iszero(DTYPE r)
template<class T>
T UnaryIsNonZero(T r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (r != 0.0) ? (T)1.0 : (T)0.0;
}
#ifdef USE_CUDA
/* define the macros separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i]); \
template<class T>
T UnaryIsZero(T r)
{
return (r == 0.0) ? (T)1.0 : (T)0.0;
}
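/*
illustrative check (not part of the original source): expected values of the helpers above
for a few sample inputs.

UnaryNegate(3)       == -3
UnarySquare(-2.0f)   == 4.0f
UnaryRound(2.4f)     == 2.0f,   UnaryRound(-2.6f) == -3.0f
UnarySign(-0.5)      == -1.0,   UnarySign(0.0)    == 0.0
UnaryIsNonZero(7.0f) == 1.0f,   UnaryIsZero(7.0f) == 0.0f
*/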
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
/* define the macros separately, specify the respective function names */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
if (useCUDA) { \
_cudaFuncName(a, b); \
return; \
} \
else \
ShowNTErrors("No GPU device support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i]); \
} \
else if (a->dataType == X_FLOAT) { \
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i]); \
} \
else if (a->dataType == X_DOUBLE) { \
double * d = (double*)a->data; \
double * db = (double*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (double)origFunc(d[i]); \
} \
else \
ShowNTErrors("TODO!"); \
}
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
#define SIMPLE_UNARY_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a) \
{ \
_funcName(&a, &a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor & a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor & a, XTensor & b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
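/*
illustrative expansion (not part of the original source): for example,
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE) below generates roughly

XTensor Square(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
_Square(&a, &b);
XLink::MakeLink(&a, NULL, &b, MATH_SQUARE);
return b;
}

so each instantiation line adds one public wrapper around the corresponding low-level
_funcName routine.
*/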
_SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, UnaryIsNonZero)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, UnaryIsZero)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Negate, _CudaNegate, UnaryNegate)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sign, _CudaSign, UnarySign)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, UnarySquare)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
#else
/* define the macros separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i]); \
}
#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a) \
{ \
_funcName(a, a); \
}
#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
_SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION_ME(AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION_ME(CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION_ME(ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION_ME(FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION_ME(IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION_ME(IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION_ME(LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_NegateMe, _Negate)
SIMPLE_UNARY_FUNCTION_ME(NegateMe, _Negate)
SIMPLE_UNARY_FUNCTION(Negate, _Negate, MATH_NEGATE)
SIMPLE_UNARY_FUNCTION_VOID(Negate, _Negate, MATH_NEGATE)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION_ME(RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SignMe, _Sign)
SIMPLE_UNARY_FUNCTION_ME(SignMe, _Sign)
SIMPLE_UNARY_FUNCTION(Sign, _Sign, MATH_SIGN)
SIMPLE_UNARY_FUNCTION_VOID(Sign, _Sign, MATH_SIGN)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION_ME(SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION_ME(SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION_ME(SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION_ME(CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION_ME(TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -24,55 +24,139 @@
#include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh"
#include <cuda_runtime.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
template<class T>
__device__
DTYPE cudasquare(DTYPE x)
T UnaryCudaCeil(T x)
{
return (T)ceil((float)x);
}
template<class T>
__device__
T UnaryCudaExp(T x)
{
return (T)exp((float)x);
}
template<class T>
__device__
T UnaryCudaFabs(T x)
{
return (T)fabs((float)x);
}
template<class T>
__device__
T UnaryCudaFloor(T x)
{
return (T)floor((float)x);
}
template<class T>
__device__
T UnaryCudaIsNonZero(T r)
{
return (r != (T)0.0) ? (T)1.0 : (T)0.0;
}
template<class T>
__device__
T UnaryCudaIsZero(T r)
{
return (r == (T)0.0) ? (T)1.0 : (T)0.0;
}
template<class T>
__device__
T UnaryCudaLog(T x)
{
return (T)log((float)x);
}
template<class T>
__device__
T UnaryCudaNegate(T x)
{
return -x;
}
template<class T>
__device__
T UnaryCudaSign(T r)
{
if (r > (T)0)
return 1.0;
else if (r == (T)0)
return 0.0;
else
return -1.0;
}
template<class T>
__device__
T UnaryCudaSqrt(T x)
{
return (T)sqrt((float)x);
}
template<class T>
__device__
T UnaryCudaSquare(T x)
{
return x * x;
}
template<class T>
__device__
DTYPE cudaround(DTYPE r)
T UnaryCudaRound(T r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
return (r > (T)0.0) ? (T)UnaryCudaFloor(r + (T)0.5) : (T)UnaryCudaCeil(r - (T)0.5);
}
template<class T>
__device__
DTYPE cudaisnonzero(DTYPE r)
T UnaryCudaSin(T x)
{
return (r != 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (T)sin((float)x);
}
template<class T>
__device__
DTYPE cudaiszero(DTYPE r)
T UnaryCudaCos(T x)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
return (T)cos((float)x);
}
template<class T>
__device__
T UnaryCudaTan(T x)
{
return (T)tan((float)x);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
template<class T> \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
void Kernel##funcName(T * a, T * b, int size) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (DTYPE)origFunc(a[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size) \
{ \
return; \
b[i] = (T)origFunc(a[i]); \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
CheckNTErrors(a->isSparse == false, "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
......@@ -85,35 +169,43 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
((float*)a->data, (float*)b->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
else if (a->dataType == X_DOUBLE) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
((double*)a->data, (double*)b->data, a->unitNum); \
} \
else if (a->dataType == X_INT) { \
Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
} \
}
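/*
illustrative expansion (not part of the original source): for example,
SIMPLE_UNARY_FUNCTION_GPU(Square, UnaryCudaSquare) below generates a templated kernel

template<class T>
__global__
void KernelSquare(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = (T)UnaryCudaSquare(a[i]);
}

plus a host wrapper _CudaSquare(const XTensor * a, XTensor * b) that launches it with
float, double or int pointers according to a->dataType.
*/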
SIMPLE_UNARY_FUNCTION_GPU(Absolute, UnaryCudaFabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, UnaryCudaCeil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, UnaryCudaExp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, UnaryCudaFloor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, UnaryCudaIsNonZero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, UnaryCudaIsZero)
SIMPLE_UNARY_FUNCTION_GPU(Log, UnaryCudaLog)
SIMPLE_UNARY_FUNCTION_GPU(Negate, UnaryCudaNegate)
SIMPLE_UNARY_FUNCTION_GPU(Round, UnaryCudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sign, UnaryCudaSign)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, UnaryCudaSqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, UnaryCudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, cudaisnonzero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, cudasquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
SIMPLE_UNARY_FUNCTION_GPU(Sin, UnaryCudaSin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, UnaryCudaCos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, UnaryCudaTan)
#endif // USE_CUDA
......
......@@ -29,121 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its absolute value (CUDA Kernel) */
__global__
void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
__global__
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
__global__
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type*/
__global__
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */
__global__
void KernelExp(DTYPE * a, DTYPE * b, int size);
/* set each entry to its exponent value (CUDA Kernel) with float16 data type*/
__global__
void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
__global__
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type*/
__global__
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsNonZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsNonZero(__half * a, __half * b, int size);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _CudaIsNonZero(const XTensor * a, XTensor * b);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type*/
__global__
void KernelIsZero(__half * a, __half * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _CudaIsZero(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its logarithm value (CUDA Kernel) with float16 data type*/
__global__
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
__global__
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type*/
__global__
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sign value */
void _CudaSign(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
__global__
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type*/
__global__
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */
__global__
void KernelSin(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sine value (CUDA Kernel) with float16 data type*/
__global__
void KernelSin(__half * a, __half * b, int size);
/* set each entry to its sine value */
void _CudaSin(const XTensor * a, XTensor * b);
/* set each entry to its cosine value (CUDA Kernel) */
__global__
void KernelCos(DTYPE * a, DTYPE * b, int size);
/* set each entry to its cosine value (CUDA Kernel) with float16 data type*/
__global__
void KernelCos(__half * a, __half * b, int size);
/* set each entry to its cosine value */
void _CudaCos(const XTensor * a, XTensor * b);
/* set each entry to its tangent value (CUDA Kernel) */
__global__
void KernelTan(DTYPE * a, DTYPE * b, int size);
/* set each entry to its tangent value (CUDA Kernel) with float16 data type*/
__global__
void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
......
......@@ -124,6 +124,20 @@ XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b);
/* set every entry to its negative value */
void _Negate(const XTensor * a, XTensor * b);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void _NegateMe(XTensor * a);
/* set every entry to its negative value (do it on site)
keep the result in the input tensor a and return nothing */
void NegateMe(XTensor & a);
/* set every entry to its negative value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Negate(const XTensor & a);
/* set every entry to its negative value */
void Negate(const XTensor & a, XTensor & b);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
......@@ -138,6 +152,20 @@ XTensor Round(const XTensor & a);
/* set every entry to its round value */
void Round(const XTensor & a, XTensor & b);
/* set every entry to its sign value */
void _Sign(const XTensor * a, XTensor * b);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void _SignMe(XTensor * a);
/* set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing */
void SignMe(XTensor & a);
/* set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
......@@ -166,7 +194,6 @@ XTensor Square(const XTensor & a);
/* set every entry to its square value */
void Square(const XTensor & a, XTensor & b);
/* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b);
/* set every entry to its sine value (do it on site)
......
......@@ -189,6 +189,29 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
}
}
/*
copy selected sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - the tensor to save the index of the source sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, int copyNum)
{
XTensor * tgtIndex = NewTensor(srcIndex);
tgtIndex->SetAscendingOrder(0);
_CopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
delete tgtIndex;
}
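/*
illustrative example (not part of the original source): following the comment above,
calling this wrapper with srcIndex = [1, 4] and copyNum = 2 copies the source
sub-tensors 1, 2, 4 and 5; the target indices are generated automatically as the
ascending sequence 0, 1 by SetAscendingOrder before _CopyIndexed is called.
*/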
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
make a new tensor to keep the result and return it
......
......@@ -31,16 +31,14 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
int * srcIndex, int indexSize, int * tgtIndex,
int copyNum = 1);
/* copy selected sub-tensors where indices are kept in tensors */
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, const XTensor * tgtIndex,
int copyNum = 1);
/*
copy selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it (remove this???)
*/
//XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* copy selected sub-tensors */
void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, int copyNum = 1);
/*
copy selected sub-tensors where indices are kept in tensors (return an XTensor structure)
......
......@@ -23,6 +23,7 @@
#include "../../XUtility.h"
#include "CopyValues.h"
#include "CopyValues.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -52,15 +52,15 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
{
ShowNTErrors("TODO!");
}
/* sparse -> dense */
else if (s->isSparse && !t->isSparse &&
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE)
{
ShowNTErrors("TODO!");
}
......
......@@ -33,28 +33,6 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 0,
we have 3 sub-tensors of size (2, 4)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......@@ -101,15 +79,10 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = s.order;
int order = index.order + 1;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1);
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
......@@ -122,20 +95,7 @@ XTensor Gather(XTensor &s, XTensor &index)
/* tensor connection */
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t;
}
return t;
}
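/*
illustrative shape example (not part of the original source): with the dimension logic
above, gathering from a 2-D table s of size (V, d) with an index tensor of size (m, n)
yields t of size (m, n, d), i.e. the index shape followed by the last dimension of s.
*/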
} // namespace nts(NiuTrans.Tensor)
......@@ -27,9 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors (return an XTensor structure)
......
......@@ -219,7 +219,6 @@ void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
}
}
}
/*
......@@ -236,15 +235,18 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
CheckNTErrors(collection->unitNum/collection->GetDim(-1) == index->unitNum,
"Illegal dimension!");
for(int i = 0; i < order; i++){
if(i == dim){
CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
}
else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
}
//for(int i = 0; i < order; i++){
// if(i == dim){
// CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
// }
// else {
// CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
// }
//}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
......
......@@ -137,6 +137,115 @@ XTensor Concatenate(const TensorList &smalls, int dim)
}
}
bool CheckConcatenateShape(const TensorList &smalls, int dim, XTensor &big, bool uniform)
{
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != big.dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true; /* big already has the expected concatenated shape */
}
void Concatenate(const TensorList & smalls, XTensor & big, int dim)
{
CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
if (!big.isInit || !CheckConcatenateShape(smalls, dim, big, uniform)) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
/* destroy variables */
delete[] dimSize;
}
if (uniform) {
/* call _Merge function */
_Merge(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
}
}
else {
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
}
}
}
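/*
illustrative example (not part of the original source): if smalls holds three tensors,
each of size (2, 4), then Concatenate(smalls, big, 0) initializes big as (6, 4) and,
because the inputs share one shape, routes the copy through _Merge; with differently
sized inputs along dim it falls back to _ConcatenateSolely.
*/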
/*
concatenate two tensors along a given dimension
......
......@@ -41,6 +41,8 @@ Note that this is actually a wrapper that selects
*/
XTensor Concatenate(const TensorList &smalls, int dim);
void Concatenate(const TensorList & smalls, XTensor & big, int dim);
/* concatenate two tensors along a given dimension */
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim);
......
......@@ -273,16 +273,16 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
merge small tensors into a big tensor
>> smalls - the list of the small tensors
>> big - the merged tensor (for return)
>> t - the merged tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
*/
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
{
whereToMerge = (whereToMerge < 0 ? big->order - 1 : whereToMerge);
whereToMerge = (whereToMerge < 0 ? t->order - 1 : whereToMerge);
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count > 0), "Empty list!");
CheckNTErrors((whereToMerge >= 0 && whereToMerge < big->order), "Wrong range of whereToMerge");
CheckNTErrors((whereToMerge >= 0 && whereToMerge < t->order), "Wrong range of whereToMerge");
bool uniform = true;
......@@ -292,7 +292,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
for (int i = 0; i < smalls->count; i++) {
XTensor* smallsItem = smalls->GetItem(i);
CheckNTErrors((big->unitNum == smallsItem->unitNum * mergeNum), "Unmatched tensors!");
CheckNTErrors((t->unitNum == smallsItem->unitNum * mergeNum), "Unmatched tensors!");
if (i > 0) {
XTensor * preItem = smalls->GetItem(i - 1);
......@@ -325,17 +325,17 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
/* merging with fewer data copy operations */
if (mergedNum * gridNum <= MIN_TENSOR_MERGE_LIST_NUM) {
int sPitch = blockSize * s0->unitSize;
int tPtich = blockSize * mergedNum * big->unitSize;
int mSize = blockSize * big->unitSize;
int tPtich = blockSize * mergedNum * t->unitSize;
int mSize = blockSize * t->unitSize;
int n = blockNum;
int sStep = 0;
int tStep = blockSize * big->unitSize;
int tStep = blockSize * t->unitSize;
for (int g = 0; g < gridNum; g++) {
char * tData = (char*)big->data + g * blockSize * blockNum * big->unitSize;
char * tData = (char*)t->data + g * blockSize * blockNum * t->unitSize;
for (int k = 0; k < mergedNum; k++) {
XTensor * s = smalls->GetItem(k);
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
XMemCopy2D(tData + k * tStep, tPtich, big->devID,
XMemCopy2D(tData + k * tStep, tPtich, t->devID,
sData + k * sStep, sPitch, s->devID,
mSize, n);
}
......@@ -358,7 +358,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
tensorTMP->data = dataTMP;
......@@ -370,7 +370,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
}
}
_Merge(tensorTMP, big, whereToMerge + 1);
_Merge(tensorTMP, t, whereToMerge + 1);
delete[] dimSizeTMP;
......@@ -380,7 +380,7 @@ void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
if ((!uniform) && (mem != NULL))
mem->ReleaseBuf(mem->devID, size);
else
XMemFree(big->devID, dataTMP);
XMemFree(t->devID, dataTMP);
}
}
......
......@@ -36,7 +36,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge);
void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge);
/* merge small tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const TensorList &smalls, int whereToMerge);
......
......@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor
>> t - the target tensor (for return)
......@@ -61,7 +61,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
/* for the case that we split the last dimension. Actually
(N, M) and (N, M/3, 3) have the same memory layout */
(N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return;
......@@ -184,7 +184,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
......
......@@ -27,6 +27,7 @@
#include "../XTensor.h"
#include "Dropout.h"
#include "DropoutWithIndex.h"
#include "HardTanH.h"
#include "Identity.h"
#include "LogSoftmax.h"
......
......@@ -16,14 +16,13 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include <stdlib.h>
#include "../XName.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -37,27 +36,27 @@ y = 1 if x > 1
*/
void _HardTanH(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaHardTanH(x, y);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
int n = x->GetSize();
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p > 1.0)
p = 1.0;
else if(p < -1.0)
p = -1.0;
op[i] = p;
}
int n = x->GetSize();
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p > 1.0)
p = 1.0;
else if(p < -1.0)
p = -1.0;
op[i] = p;
}
else
ShowNTErrors("TODO!");
}
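/*
illustrative values (not part of the original source): the clipping above maps
1.7 -> 1.0, 0.3 -> 0.3 and -2.5 -> -1.0, i.e. every entry is clamped to [-1, 1].
*/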
/*
......@@ -111,50 +110,36 @@ hard tanh: y = 1 if x > 1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the hardtanh function
>> x - input of the hardtanh function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _HardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
CheckNTErrors(x != NULL, "The input tensor x must not be NULL!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaHardTanHBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaHardTanHBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE s =ip[i];
if(s > 1.0 || s < -1.0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = x->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE s = ip[i];
if(s > 1.0 || s < -1.0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
else
ShowNTErrors("TODO!");
}
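/*
illustrative values (not part of the original source): the backward pass above realizes
dE/dx = dE/dy * dy/dx with dy/dx = 1 inside [-1, 1] and 0 outside, so a gradient at
x = 0.3 passes through unchanged while a gradient at x = 1.7 is zeroed.
*/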
} // namespace nts(NiuTrans.Tensor)
......@@ -21,8 +21,6 @@
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "Loss.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -63,25 +61,19 @@ y = 1 if x > 1
*/
void _CudaHardTanH(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse,
"The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(!x->isSparse && !y->isSparse, "The hard tanh activation function does not support sparse tensors.");
CheckNTErrors(x->unitNum && y->unitNum, "The x vectors must be of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelHardtanhCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else{
ShowNTErrors("TODO!");
}
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -92,13 +84,12 @@ dy/dx = 1 if -1 <= x <= 1
>> dedy - dE/dy
>> dedx - dE/dx
>> gold - gold standard
>> y - y of the function
>> x - x of the function
>> size - size of y/x
*/
__global__
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * x, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -123,44 +114,29 @@ hard tanh: y = 1 if x > 1
and dy/dx = 1 if -1 <= x <= 1
0 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the hardtanh function
>> x - input of the hardtanh function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaHardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
int gridSize[3], blockSize[3];
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int gridSize[3], blockSize[3];
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)x->data,
x->unitNum);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelHardtanhBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
......
......@@ -23,7 +23,6 @@
#define __HARDTANH_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -38,9 +37,8 @@ y = 1 if x > 1
void _CudaHardTanH(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaHardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -23,7 +23,6 @@
#define __HARDHANH_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -43,9 +42,8 @@ XTensor HardTanH(const XTensor &x);
void HardTanH(const XTensor &x, XTensor &y);
/* de/dx */
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _HardTanHBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,9 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include "../XName.h"
#include "Identity.h"
#include "../loss/LHeader.h"
#include "../XName.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -30,10 +29,12 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
identity function y = x
>> x - input tensor
>> y - result
>> y - output tensor
*/
void _Identity(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
_CopyValues(x, y);
}
......@@ -42,7 +43,7 @@ identity function y = x (return an XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
<< return - output tensor
*/
XTensor Identity(const XTensor &x)
{
......@@ -78,33 +79,16 @@ backward computation for identity function y = x
dE/dx = dE/dy * dy/dx = dE/dy
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the identity function
>> x - input of the identity function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _IdentityBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
}
else
ShowNTErrors("TODO!");
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
}
} // namespace nts(NiuTrans.Tensor)
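As a plain illustration of the simplified interface (not from this commit; the helper name is made up), the identity backward step is just a copy of dE/dy into dE/dx, skipped when the two buffers alias.

void IdentityBackwardSketch(const float * dedy, float * dedx, int size)
{
    if(dedx == dedy)    /* nothing to do when dE/dx shares storage with dE/dy */
        return;
    for(int i = 0; i < size; i++)
        dedx[i] = dedy[i];
}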
......@@ -23,7 +23,6 @@
#define __IDENTITY_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Identity(const XTensor &x);
void Identity(const XTensor &x, XTensor &y);
/* de/dx */
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _IdentityBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -222,7 +222,6 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
}
}
/*
backward computation for dense matrices with default data type
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-26
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-26
*/
#ifndef __LOGSOFTMAX_CUH__
#define __LOGSOFTMAX_CUH__
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#ifndef __LOGSOFTMAX_H__
#define __LOGSOFTMAX_H__
......@@ -36,6 +36,9 @@ XTensor LogSoftmax(const XTensor &x, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both arguments x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
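The header only declares the interface; as a rough sketch (not part of this commit, hypothetical helper name), a numerically stable log-softmax over one row of plain floats subtracts the maximum before exponentiating so that exp() cannot overflow.

#include <cmath>

void LogSoftmaxSketch(const float * x, float * y, int size)
{
    float maxv = x[0];
    for(int i = 1; i < size; i++)
        if(x[i] > maxv)
            maxv = x[i];

    float sum = 0.0F;
    for(int i = 0; i < size; i++)
        sum += std::exp(x[i] - maxv);

    /* y_i = x_i - log(\sum_j e^{x_j}) */
    float logSum = maxv + std::log(sum);
    for(int i = 0; i < size; i++)
        y[i] = x[i] - logSum;
}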
......@@ -22,10 +22,9 @@
#include "Loss.h"
#include "Loss.cuh"
#include "../XDevice.h"
#include "../core/math/Power.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/Unary.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Binary.h"
#include "../core/arithmetic/Sum.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/reduce/ReduceSum.h"
......
......@@ -16,23 +16,25 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
rectify function y = max(0, x)
>> input - input tensor
>> output - result
>> x - input tensor
>> y - output tensor
*/
void _Rectify(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaRectify(x, y);
......@@ -40,28 +42,24 @@ void _Rectify(const XTensor * x, XTensor * y)
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
int n = x->GetSize();
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p < 0)
p = 0;
op[i] = p;
}
DTYPE * ip = (DTYPE*)x->data;
DTYPE * op = (DTYPE*)y->data;
int n = x->GetSize();
for(int i = 0; i < n; i++){
DTYPE p = ip[i];
if(p < 0)
p = 0;
op[i] = p;
}
else
ShowNTErrors("TODO!");
}
/*
rectify function y = max(0, x) (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - input tensor
<< return - y
>> x - input tensor
<< return - output tensor
*/
XTensor Rectify(const XTensor &x)
{
......@@ -107,50 +105,36 @@ rectified: y = 0 if x < 0
and dy/ds = 0 if x < 0
1 otherwise
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> y - output of the rectify function
>> x - input of the rectify function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _RectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
CheckNTErrors(x != NULL, "The input tensor x must be not NULL!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaRectifyBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaRectifyBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = y->unitNum;
for(int i = 0; i < size; i++){
/* dE/ds = dE/dy * dy/ds = dE/dy */
DTYPE s = ip[i];
if(s < 0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * ip = (DTYPE*)x->data;
int size = x->unitNum;
for(int i = 0; i < size; i++){
/* dE/ds = dE/dy * dy/ds = dE/dy */
DTYPE s = ip[i];
if(s < 0)
dedxp[i] = 0;
else
dedxp[i] = dedyp[i];
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
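For reference (illustration only; the helper name is hypothetical): with the loss handling moved out, the rectify backward step is a simple element-wise gate on the sign of x.

void RectifyBackwardSketch(const float * dedy, float * dedx,
                           const float * x, int size)
{
    /* dE/dx = dE/dy if x >= 0, and 0 otherwise */
    for(int i = 0; i < size; i++)
        dedx[i] = (x[i] < 0.0F) ? 0.0F : dedy[i];
}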
......@@ -16,13 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "Rectify.h"
#include "Rectify.cuh"
#include "Loss.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -57,24 +54,17 @@ rectify function y = max(0, x)
*/
void _CudaRectify(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "The Rectify function does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "The input vectors must be of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelRectify<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelRectify<<<dim3(gridSize[0]), dim3(blockSize[0])>>>
((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -85,13 +75,11 @@ dy/dx = 1 if x >= 0
>> dedy - dE/dy
>> dedx - dE/dx
>> gold - gold standard
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * x, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -104,11 +92,10 @@ void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y,
}
}
/*
backward computation (Cuda version)
dE/ds = dE/dy * dy/ds
dE/dx = dE/dy * dy/dx
rectify : y = s if s >= 0
0 if s < 0
......@@ -116,48 +103,29 @@ rectify : y = s if s >= 0
and dy/ds = 1 if s >= 0
0 otherwise
>> gold - gold standard to measure error (or loss)
>> output - output of the activation function, i.e., y
>> input - input of the activation function , i.e., s
>> dEdY - dE/dy
>> dEdS - dE/ds
>> lossName - type of loss function, e.g., cross entropy
>> gBeg - where to start in the gold standard (along the leading dimension)
>> gLen - segment length from gBeg (along the leading dimension)
>> oBeg - where to start in the model output (along the leading dimension)
>> parallelRunner - parallel processing module
>> y - output of the rectify function
>> x - input of the rectify function
>> dedy - dE/dy
>> dedx - dE/dx
*/
void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaRectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelRectifyBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelRectifyBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
......
......@@ -23,7 +23,6 @@
#define __RECTIFY_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,9 +32,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaRectify(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaRectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -23,7 +23,6 @@
#define __RECTIFY_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Rectify(const XTensor &x);
void Rectify(const XTensor &x, XTensor &y);
/* de/dx */
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _RectifyBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -34,6 +34,9 @@ sigmoid function y = 1/(1+exp(-x))
*/
void _Sigmoid(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaSigmoid(x, y);
......@@ -59,7 +62,7 @@ sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
<< return - output tensor
*/
XTensor Sigmoid(const XTensor &x)
{
......@@ -97,50 +100,32 @@ dE/ds = dE/dy * dy/dx
sigmoid: y = 1/(1+exp(-x))
and dy/dx = y * (1 -y)
and dy/dx = y * (1 - y)
>> gold - gold standard to measure the error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _SigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
CheckNTErrors((gold == NULL || XTensor::IsSameShaped(gold, y)),
"The tensors must be of the same size!");
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaSigmoidBackward(gold, y, x, dedy, dedx, lossName);
if(x->devID >= 0){
_CudaSigmoidBackward(y, x, dedy, dedx);
return;
}
#endif
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * op = (DTYPE*)y->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE y = op[i];
dedxp[i] = dedyp[i] * (DTYPE)y * ((DTYPE)1.0 - y);
}
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
DTYPE * op = (DTYPE*)y->data;
int size = y->unitNum;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < size; i++){
DTYPE y = op[i];
dedxp[i] = dedyp[i] * (DTYPE)y * ((DTYPE)1.0 - y);
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
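For reference (illustration only, not from this commit): since dy/dx = y * (1 - y), the backward step only needs the forward output y, as in this minimal sketch with a hypothetical helper name.

void SigmoidBackwardSketch(const float * dedy, float * dedx,
                           const float * y, int size)
{
    /* dE/dx = dE/dy * y * (1 - y) */
    for(int i = 0; i < size; i++)
        dedx[i] = dedy[i] * y[i] * (1.0F - y[i]);
}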
......@@ -61,24 +61,19 @@ sigmoid function y = 1/(1+exp(-x)) (Cuda version)
*/
void _CudaSigmoid(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the activation function (sigmoid) does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "we require two vectors of the same length.");
CheckNTErrors(!x->isSparse && !y->isSparse, "the activation function (sigmoid) does not support sparse matrices.");
CheckNTErrors(x->unitNum && y->unitNum, "we require two vectors of the same length.");
int gridSize[3], blockSize[3];
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelSigmoidCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
KernelSigmoidCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
BacktoCudaDev(x->devID, devIDBackup);
}
/*
......@@ -92,13 +87,12 @@ sigmoid: y = 1/(1+exp(-x))
>> dedy - dE/dy
>> dedx - dE/ds
>> gold - gold standard
>> y - output of the function
>> x - input of the function
>> size - size of output/input
*/
__global__
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -116,46 +110,31 @@ sigmoid: y = 1/(1+exp(-x))
and dy/dx = y * (1 - y)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> lossName - type of loss function, e.g., cross entropy
*/
void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName)
void _CudaSigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/ds = dE/dy * dy/ds */
KernelSigmoidBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
gold == NULL ? NULL : (DTYPE*)gold->data,
(DTYPE*)y->data, (DTYPE*)x->data,
x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(y->devID, y->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(y->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelSigmoidBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data,
(DTYPE*)dedx->data,
(DTYPE*)y->data,
y->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
#endif
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -23,7 +23,6 @@
#define __SIGMOID_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,9 +32,8 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _CudaSigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
#endif // USE_CUDA
......
......@@ -16,14 +16,13 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#ifndef __SIGMOID_H__
#define __SIGMOID_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -36,9 +35,8 @@ XTensor Sigmoid(const XTensor &x);
void Sigmoid(const XTensor &x, XTensor &y);
/* de/dx */
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
void _SigmoidBackward(XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include <math.h>
#include "Softmax.h"
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#ifndef __SOFTMAX_H__
#define __SOFTMAX_H__
......
......@@ -28,7 +28,6 @@
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/arithmetic/Negate.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
......@@ -63,23 +62,6 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
/*XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_Log(output, interBuf1);
_Multiply(gold, interBuf1, interBuf2);
if(weight != NULL)
_MultiplyDimMe(interBuf2, weight, n);
_NegateMe(interBuf2);
_ReduceSum(interBuf2, loss, n);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);*/
XTensor * inter = NewTensor(output);
_Log(output, inter);
......@@ -94,7 +76,6 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
_MultiplyMe(loss, padding);
DelTensor(inter);
}
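The tensor routine above computes loss = -sum_i gold_i * log(output_i), optionally weighted and masked by padding. A minimal scalar sketch for a single row (illustration only; the helper name is hypothetical) would be:

#include <cmath>

float CrossEntropySketch(const float * output, const float * gold, int size)
{
    float loss = 0.0F;
    for(int i = 0; i < size; i++)
        loss -= gold[i] * std::log(output[i]);
    return loss;
}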
/*
......
......@@ -29,7 +29,6 @@
#include "../core/arithmetic/Div.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/reduce/ReduceSum.h"
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "TCrossEntropy.h"
#include "../loss/CrossEntropy.h"
#include "../core/math/ScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -22,8 +22,6 @@
#ifndef __TEST_CROSSENTROPY_H__
#define __TEST_CROSSENTROPY_H__
#include "../loss/CrossEntropy.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CrossEntropy Function */
......
......@@ -25,211 +25,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2,
srcIndex = [0, 2], indexSize = 2
*/
bool TestGather1()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0F, 3.0F} },
{ {1.0F, 4.0F},
{3.0F, 2.0F}},
{ {-1.0F, 2.0F},
{1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
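As a plain restatement of what this case checks (illustration only; the fixed sizes and helper name are just for this example), gathering along the last dimension with srcIndex = {0, 2} picks out those two slices:

void GatherLastDimSketch(const float src[3][2][3], float dst[3][2][2],
                         const int * srcIndex, int indexSize)
{
    /* copy the indexed columns of the last dimension into the result */
    for(int i = 0; i < 3; i++)
        for(int j = 0; j < 2; j++)
            for(int k = 0; k < indexSize; k++)
                dst[i][j][k] = src[i][j][srcIndex[k]];
}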
/*
case 2: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 1, 3), dim = 1,
srcIndex = [0], indexSize = 1
*/
bool TestGather2()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 1, 3) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 1;
tDimSize[2] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][1][3] = { { {0.0F, -1.0F, 2.0F} },
{ {1.0F, 2.0F, 4.0F} } ,
{ {-1.0F, 3.0F, 2.0F} } };
int dim = 1;
int indexSize = 1;
int srcIndex[2] = {0};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: gather indexed sub-tensors
In this case, (3, 3) -> (2, 3), dim = 0,
srcIndex = [0, 2]
*/
bool TestGather3()
bool TestGather1()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
......@@ -286,7 +85,7 @@ bool TestGather3()
index->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
_Gather(s, t, index);
tUser = Gather(*s, *index);
/* check results */
......@@ -309,7 +108,7 @@ bool TestGather3()
indexGPU->SetData(srcIndex, indexSize);
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
_Gather(sGPU, tGPU, indexGPU);
tUserGPU = Gather(*sGPU, *indexGPU);
/* check results */
......@@ -360,24 +159,6 @@ bool TestGather()
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestGather2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 2 test */
caseFlag = TestGather3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
......
......@@ -129,14 +129,12 @@ bool TestHardTanH2()
DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
{3.5F, -4.5F, 1.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {0.5F, -1.0F, 1.0F},
{1.0F, -1.0F, 1.0F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, 0.0F, -0.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F},
{0.0F, -2.0F, 0.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -144,27 +142,24 @@ bool TestHardTanH2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
gold->SetData(goldData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(x, y);
/* call HardTanHBackward function */
_HardTanHBackward(gold, y, x, dedy, dedx, SQUAREDERROR);
_HardTanHBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -173,37 +168,32 @@ bool TestHardTanH2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
goldGPU->SetData(goldData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call HardTanH function */
_HardTanH(xGPU, yGPU);
/* call hardtanhbackward function */
_HardTanHBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, SQUAREDERROR);
_HardTanHBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check results */
gpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete gold;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete goldGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -213,7 +203,6 @@ bool TestHardTanH2()
/* destroy variables */
delete x;
delete y;
delete gold;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -120,9 +120,8 @@ bool TestIdentity2()
unitNum *= dimSize[i];
DTYPE xData[3] = {1.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.0F, 0.0F, 1.0F};
DTYPE yAnswer[3] = {1.0F, 1.0F, 2.0F};
DTYPE dedyAnswer[3] = {0.0F, 0.0F, -0.5F};
DTYPE dedyData[3] = {0.0F, 0.0F, -0.5F};
DTYPE dedxAnswer[3] = {0.0F, 0.0F, -0.5F};
/* CPU test */
......@@ -131,27 +130,24 @@ bool TestIdentity2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * g = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
g->SetData(gData, unitNum);
y->SetZeroAll();
dedx->SetZeroAll();
dedy->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call Identity function */
_Identity(x, y);
/* call IdentityBackward function */
_IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
_IdentityBackward(y, x, dedy, dedx);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -160,37 +156,32 @@ bool TestIdentity2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call Identity function */
_Identity(xGPU, yGPU);
/* call IdentityBackward function */
_IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_IdentityBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -200,7 +191,6 @@ bool TestIdentity2()
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -22,7 +22,7 @@
#ifndef __TEST_NEGATE_H__
#define __TEST_NEGATE_H__
#include "../core/arithmetic/Negate.h"
#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -19,6 +19,7 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
*/
#include "../core/math/Binary.h"
#include "../XUtility.h"
#include "TPower.h"
......
......@@ -22,8 +22,6 @@
#ifndef __TEST_POWER_H__
#define __TEST_POWER_H__
#include "../core/math/Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Power Function */
......
......@@ -119,16 +119,14 @@ bool TestRectify2()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE xData[2][3] = { {-1.0F, 1.0F, 2.0F},
{-2.0F, 4.0F, 5.0F} };
DTYPE yData[2][3] = { {0.0F, 1.0F, 2.0F},
{0.0F, 4.0F, 5.0F} };
DTYPE dedyData[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE dedxAnswer[2][3] = { {0.0F, -0.5F, -0.25F},
{0.0F, -0.125F, -0.1F} };
/* CPU test */
bool cpuTest = true;
......@@ -136,27 +134,22 @@ bool TestRectify2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
gold->SetData(goldData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
y->SetData(yData, unitNum);
dedy->SetData(dedyData, unitNum);
dedx->SetZeroAll();
/* call Rectify function */
_Rectify(x, y);
/* call RectifyBackward function */
_RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
_RectifyBackward(y, x, dedy, dedx);
/* check results */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -165,39 +158,32 @@ bool TestRectify2()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
goldGPU->SetData(goldData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
yGPU->SetData(yData, unitNum);
dedyGPU->SetData(dedyData, unitNum);
dedxGPU->SetZeroAll();
/* call Rectify function */
_Rectify(xGPU, yGPU);
/* call rectifybackward function */
_RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_RectifyBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check results */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete dedy;
delete dedx;
delete gold;
delete xGPU;
delete yGPU;
delete dedyGPU;
delete dedxGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
......@@ -207,8 +193,7 @@ bool TestRectify2()
delete y;
delete dedy;
delete dedx;
delete gold;
delete[] dimSize;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
......
......@@ -59,7 +59,8 @@ bool TestSigmoid1()
yUser = Sigmoid(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) &&
yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -79,7 +80,8 @@ bool TestSigmoid1()
yUserGPU = Sigmoid(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) &&
yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......@@ -104,7 +106,7 @@ case 2: test Sigmoid function and SigmoidBackward function.
sigmoid function: y = 1/(1+exp(-x))
backward computation:
dE/ds = dE/dy * dy/dx
dy/dx = y * (1 -y)
dy/dx = y * (1 - y)
In this case, LossName=CROSSENTROPY.
*/
bool TestSigmoid2()
......@@ -119,10 +121,9 @@ bool TestSigmoid2()
unitNum *= dimSize[i];
DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
DTYPE yAnswer[3] = {0.5F, 0.7311F, 0.8808F};
DTYPE dedyAnswer[3] = {-0.8F, -1.0943F, -1.1353F};
DTYPE dedxAnswer[3] = {-0.2F, -0.2151F, -0.1192F};
DTYPE dedyData[3] = {0.0F, 1.0F, 2.0F};
DTYPE dedxAnswer[3] = {0.0F, 0.1966F, 0.2100F};
/* CPU test */
bool cpuTest = true;
......@@ -130,65 +131,58 @@ bool TestSigmoid2()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * g = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
/* initialize variables */
x->SetData(xData, unitNum);
g->SetData(gData, unitNum);
y->SetZeroAll();
dedy->SetZeroAll();
dedx->SetZeroAll();
dedy->SetData(dedyData, unitNum);
/* call Sigmoid function */
_Sigmoid(x, y);
/* call SigmoidBackward function */
_SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
_SigmoidBackward(y, x, dedy, dedx);
/* check result */
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F)
&& dedx->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedy->CheckData(dedyAnswer, unitNum, 1e-4F);
cpuTest = y->CheckData(yAnswer, unitNum, 1e-4F) &&
dedx->CheckData(dedxAnswer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * gGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
xGPU->SetData(xData, unitNum);
gGPU->SetData(gData, unitNum);
yGPU->SetZeroAll();
dedyGPU->SetZeroAll();
dedxGPU->SetZeroAll();
dedyGPU->SetData(dedyData, unitNum);
/* call Sigmoid function */
_Sigmoid(xGPU, yGPU);
/* call SigmoidBackward function */
_SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
_SigmoidBackward(yGPU, xGPU, dedyGPU, dedxGPU);
/* check result */
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F)
&& dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F)
&& dedyGPU->CheckData(dedyAnswer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(yAnswer, unitNum, 1e-4F) &&
dedxGPU->CheckData(dedxAnswer, unitNum, 1e-4F);
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete gGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
......@@ -198,7 +192,6 @@ bool TestSigmoid2()
/* destroy variables */
delete x;
delete y;
delete g;
delete dedx;
delete dedy;
delete[] dimSize;
......
......@@ -22,7 +22,7 @@
#ifndef __TEST_SIGN_H__
#define __TEST_SIGN_H__
#include "../core/arithmetic/Sign.h"
#include "../core/math/Unary.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -192,22 +192,22 @@ bool TestSpread2()
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * cIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexSize);
cIndex->SetData(tgtIndex, indexSize);
tIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(s1, t, dim, sIndex, cIndex, 1);
_SpreadForCopyIndexed(s1, t, dim, sIndex, tIndex, 1);
_SpreadForGather(s2, t, sIndex);
/* check results */
cpuTest = s1->CheckData(answer, tUnitNum) &&
s2->CheckData(answer, tUnitNum);
cpuTest = s1->CheckData(answer, sUnitNum) &&
s2->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -218,34 +218,34 @@ bool TestSpread2()
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * cIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexSize);
cIndexGPU->SetData(tgtIndex, indexSize);
tIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndex, cIndex, 1);
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, tIndexGPU, 1);
_SpreadForGather(sGPU2, tGPU, sIndexGPU);
/* check results */
gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
sGPU2->CheckData(answer, tUnitNum);
gpuTest = sGPU1->CheckData(answer, sUnitNum) &&
sGPU2->CheckData(answer, sUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete tIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete sIndexGPU;
delete cIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -257,7 +257,142 @@ bool TestSpread2()
delete s2;
delete t;
delete sIndex;
delete cIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 3: test _SpreadForGather and _SpreadForCopyIndexed function
spread a collection tensor to source tensor
*/
bool TestSpread3()
{
/* an input tensor of size (3, 3) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2) */
int tOrder = 2;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int indexOrder = 1;
int * indexDimSize = new int[indexOrder];
indexDimSize[0] = 2;
int indexUnitNum = 1;
for (int i = 0; i < indexOrder; i++)
indexUnitNum *= indexDimSize[i];
DTYPE sData[3][3] = { {0.0F, 0.0F, 2.0F},
{2.0F, 1.0F, 3.0F},
{2.0F, 2.0F, 4.0F} };
DTYPE tData[3][2] = { {0.0F, -1.0F},
{2.0F, 1.0F},
{2.0F, 0.0F} };
DTYPE answer[3][3] = { {-1.0F, 0.0F, 2.0F},
{3.0F, 1.0F, 5.0F},
{2.0F, 2.0F, 6.0F} };
int dim = 1;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {1, 0};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * sIndex = NewTensor(indexOrder, indexDimSize, X_INT);
XTensor * tIndex = NewTensor(indexOrder, indexDimSize, X_INT);
/* initialize variables */
s1->SetData(sData, sUnitNum);
s2->SetData(sData, sUnitNum);
t->SetData(tData, tUnitNum);
sIndex->SetData(srcIndex, indexSize);
tIndex->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(s1, t, dim, sIndex, tIndex, 1);
_SpreadForCopyIndexed(s2, t, dim, sIndex, tIndex, 1);
/* check results */
cpuTest = s1->CheckData(answer, sUnitNum) &&
s2->CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * sIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
XTensor * tIndexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
/* initialize variables */
sGPU1->SetData(sData, sUnitNum);
sGPU2->SetData(sData, sUnitNum);
tGPU->SetData(tData, tUnitNum);
sIndexGPU->SetData(srcIndex, indexSize);
tIndexGPU->SetData(tgtIndex, indexSize);
/* call _SpreadForGather function */
_SpreadForCopyIndexed(sGPU1, tGPU, dim, sIndexGPU, tIndexGPU, 1);
_SpreadForCopyIndexed(sGPU2, tGPU, dim, sIndexGPU, tIndexGPU, 1);
/* check results */
gpuTest = sGPU1->CheckData(answer, sUnitNum) &&
sGPU2->CheckData(answer, sUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete tIndex;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete sIndexGPU;
delete tIndexGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s1;
delete s2;
delete t;
delete sIndex;
delete tIndex;
delete[] sDimSize;
delete[] tDimSize;
delete[] indexDimSize;
......@@ -285,6 +420,24 @@ bool TestSpread()
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 1 test */
caseFlag = TestSpread2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 1 test */
caseFlag = TestSpread3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* other cases test */
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/
bool TestSumByColumnTV1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 1) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 1;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 4) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 4;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0F},
{0.0F} };
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
_SumByColumnTV(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnTV function */
_SumByColumnTV(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
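For reference, the case above checks c[i][j] = a[i][j] + b[i][0] element by element: row 0 of a is shifted by b[0][0] = 1, giving {1.0F, 2.0F, 3.0F, 4.0F}, and row 1 is shifted by b[1][0] = 0, so it stays {4.0F, 5.0F, 6.0F, 7.0F}.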
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnTV Function */
bool TestSumByColumnTV()
{
XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnTV1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNTV_H__
#define __TEST_SUMBYCOLUMNTV_H__
#include "../core/arithmetic/SumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnTV Function */
extern "C"
bool TestSumByColumnTV();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNTV_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/
bool TestSumByColumnVT1()
{
/* a tensor of size (2, 1) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 1;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 4) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
/* a tensor of size (2, 1) */
int cOrder = 2;
int * cDimSize = new int[cOrder];
cDimSize[0] = 2;
cDimSize[1] = 1;
int cUnitNum = 1;
for (int i = 0; i < cOrder; i++)
cUnitNum *= cDimSize[i];
DTYPE aData[2][1] = { {1.0F},
{0.0F} };
DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0F},
{22.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(cOrder, cDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SumByColumnVT function */
_SumByColumnVT(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SumByColumnVT function */
_SumByColumnVT(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete aGPU;
delete bGPU;
delete cGPU;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete[] aDimSize;
delete[] bDimSize;
delete[] cDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SumByColumnVT Function */
bool TestSumByColumnVT()
{
XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSumByColumnVT1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#ifndef __TEST_SUMBYCOLUMNVT_H__
#define __TEST_SUMBYCOLUMNVT_H__
#include "../core/arithmetic/SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SumByColumnVT Function */
extern "C"
bool TestSumByColumnVT();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUMBYCOLUMNVT_H__
......@@ -68,10 +68,9 @@ bool Test()
wrong = !TestSin() || wrong;
wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong;
wrong = !TestSpread() || wrong;
wrong = !TestSub() || wrong;
wrong = !TestSum() || wrong;
wrong = !TestSumByColumnTV() || wrong;
wrong = !TestSumByColumnVT() || wrong;
wrong = !TestSumDim() || wrong;
wrong = !TestTan() || wrong;
wrong = !TestTranspose() || wrong;
......
......@@ -61,10 +61,9 @@
#include "TSin.h"
#include "TSort.h"
#include "TSplit.h"
#include "TSpread.h"
#include "TSub.h"
#include "TSum.h"
#include "TSumByColumnTV.h"
#include "TSumByColumnVT.h"
#include "TSumDim.h"
#include "TTan.h"
#include "TTranspose.h"
......