Commit e223c59c by xuchen

Fixed bugs and added some functions

parent 8e13830b
@@ -2,7 +2,7 @@
## Notes
-The latest CUDA release (9.2) does not yet support the latest VS2017, so CUDA 9.0 or 9.1 is recommended, together with VS2015, or VS2017 with the v140 toolset installed.
+The latest CUDA release (9.2) does not yet support the latest VS2017, so CUDA 9.0 or 9.1 is recommended, together with VS2015, or VS2017 with the v140 toolset installed; set the solution platform to x64.
## CUDA Configuration
@@ -29,7 +29,7 @@
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
->USE_CUDA;USE_BLAS;WIN32;MKL;DEBUG;CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
+>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
CONSOLE;
Set **Linker -> System -> SubSystem** to Console.
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "../tensor/XUtility.h" #include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h" #include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h" #include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h" #include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h" #include "../sample/transformer/Transformer.h"
...@@ -31,18 +32,24 @@ ...@@ -31,18 +32,24 @@
//#include <stdlib.h> //#include <stdlib.h>
//#include <crtdbg.h> //#include <crtdbg.h>
void BackwardTest();
void TransposeTest(); void TransposeTest();
void SumDimTest(); void SumDimTest();
using namespace nts; using namespace nts;
using namespace fnnlm; using namespace fnnlm;
using namespace transformer; using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv ) int main( int argc, const char ** argv )
{ {
//_CrtSetBreakAlloc(896); //_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
-if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+if(argc > 1 && !strcmp(argv[1], "-test"))
+    Test();
+else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
    FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t")) else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1); TransformerMain(argc - 1, argv + 1);
...@@ -58,6 +65,41 @@ int main( int argc, const char ** argv ) ...@@ -58,6 +65,41 @@ int main( int argc, const char ** argv )
return 0; return 0;
} }
void BackwardTest()
{
XNet net;
XTensor a;
XTensor b;
XTensor c;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);
a.SetZeroAll();
b.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 0, 1);
a.Set2D(3.0F, 0, 2);
a.Set2D(4.0F, 1, 0);
a.Set2D(5.0F, 1, 1);
a.Set2D(6.0F, 1, 2);
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Dump(stderr);
}
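For reference, a worked reading of what this test computes (a hedged sketch, assuming DivDim with n = 0 divides each slice a[i][*] by b[i]):

    a = \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{pmatrix}, \quad b = (2, 1), \quad
    c = \mathrm{DivDim}(a, b, 0) = \begin{pmatrix} 0.5 & 1 & 1.5 \\ 4 & 5 & 6 \end{pmatrix}

The Dump/ShowNetwork/Backward calls should then show this c together with the gradients propagated back to a and b.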
void TransposeTest() void TransposeTest()
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
......
...@@ -50,6 +50,7 @@ void XFuncGrad::MakeGrad(XTensor * node) ...@@ -50,6 +50,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS); _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){ else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0); int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS); _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
} }
else if(operID == FUNC_RECTIFY) else if(operID == FUNC_RECTIFY)
...@@ -58,6 +59,7 @@ void XFuncGrad::MakeGrad(XTensor * node) ...@@ -58,6 +59,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS); _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){ else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0); int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS); _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
} }
else{ else{
......
...@@ -22,7 +22,11 @@ ...@@ -22,7 +22,11 @@
#include "XBackwardLoss.h" #include "XBackwardLoss.h"
#include "../tensor/XName.h" #include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h" #include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
#include "../tensor/function/LogSoftmax.h" #include "../tensor/function/LogSoftmax.h"
#include "../tensor/function/Rectify.h"
#include "../tensor/function/Sigmoid.h"
#include "../tensor/function/Softmax.h"
namespace nts{ namespace nts{
...@@ -49,10 +53,22 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, ...@@ -49,10 +53,22 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
if(funcID == FUNC_HARDTANH){ if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName); _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
} }
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){ else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params; int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName); _LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
} }
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else{ else{
ShowNTErrors("wrong function found when call the backward process!"); ShowNTErrors("wrong function found when call the backward process!");
} }
......
...@@ -41,6 +41,8 @@ void XMathGrad::MakeGrad(XTensor * node) ...@@ -41,6 +41,8 @@ void XMathGrad::MakeGrad(XTensor * node)
GradSumDim(node); GradSumDim(node);
else if(operID == MATH_MULTIPLY) else if(operID == MATH_MULTIPLY)
GradMultiply(node); GradMultiply(node);
else if (operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node);
else if(operID == MATH_MATRIXMUL) else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node); GradMatrixMul(node);
else if(operID == MATH_MATRIXMULBATCHED) else if(operID == MATH_MATRIXMULBATCHED)
...@@ -55,6 +57,8 @@ void XMathGrad::MakeGrad(XTensor * node) ...@@ -55,6 +57,8 @@ void XMathGrad::MakeGrad(XTensor * node)
GradScaleAndShift(node); GradScaleAndShift(node);
else if (operID == MATH_DIV) else if (operID == MATH_DIV)
GradDiv(node); GradDiv(node);
else if (operID == MATH_DIVDIM)
GradDivDim(node);
else if (operID == MATH_SUB) else if (operID == MATH_SUB)
GradSub(node); GradSub(node);
else if (operID == MATH_SIN) else if (operID == MATH_SIN)
...@@ -242,6 +246,94 @@ void XMathGrad::GradMultiply(XTensor * node) ...@@ -242,6 +246,94 @@ void XMathGrad::GradMultiply(XTensor * node)
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
/*
gradient for multiply with one dimension
c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
}
DelTensorBuf(interGrad);
}
DelTensor(bGradTMP);
node->visitMark = NODE_FINISHED;
}
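A hedged sketch of the math behind the reshape-and-reduce above: with c = a \odot \tilde b, where \tilde b is b broadcast along dimension n,

    \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \odot \tilde b, \qquad
    \frac{\partial E}{\partial b_j} = \sum_{i\,:\,i_n = j} \left(\frac{\partial E}{\partial c}\right)_i a_i

so the 2D/3D Reshape only rearranges dE/dc * a into shape (x, |b|) or (x, |b|, z) and _ReduceSum over the non-b axes then yields dE/db.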
/* /*
gradient for matrix multiply gradient for matrix multiply
for c = matmul(a, b) * \alpha for c = matmul(a, b) * \alpha
...@@ -381,7 +473,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node) ...@@ -381,7 +473,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node)
XNoder::MakeGrad(a); XNoder::MakeGrad(a);
XNoder::MakeGrad(b); XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad; XTensor * dedc = node->grad;
XTensor * deda = a->grad; XTensor * deda = a->grad;
XTensor * dedb = b->grad; XTensor * dedb = b->grad;
...@@ -460,7 +551,7 @@ gradient for power ...@@ -460,7 +551,7 @@ gradient for power
for for
c = pow(a,p) c = pow(a,p)
we have we have
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation >> node - the node (c) for backward computation
*/ */
void XMathGrad::GradPower(XTensor * node) void XMathGrad::GradPower(XTensor * node)
...@@ -469,21 +560,19 @@ void XMathGrad::GradPower(XTensor * node) ...@@ -469,21 +560,19 @@ void XMathGrad::GradPower(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XTensor * c = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
-_Power(a, b, (p-1)/p);
-_ScaleAndShift(b, c, p);
-_Multiply(node->grad, c, a->grad, 1.0F);
-
-node->visitMark = NODE_FINISHED;
-
-delete b;
-delete c;
+_Power(a, b, p - 1.0F);
+_ScaleAndShiftMe(b, p);
+_Multiply(node->grad, b, a->grad, 1.0F);
+
+DelTensor(b);
+
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -500,16 +589,16 @@ void XMathGrad::GradNegate(XTensor * node) ...@@ -500,16 +589,16 @@ void XMathGrad::GradNegate(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -526,18 +615,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node) ...@@ -526,18 +615,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
-_ScaleAndShift(node->grad, b, scale);
-_Sum(a->grad, b, a->grad);
+_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
-
-delete b;
} }
/* /*
@@ -583,9 +668,7 @@ void XMathGrad::GradDiv(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
-XTensor * c = NewTensor(b);
-XTensor * d = NewTensor(b);
-XTensor * e = NewTensor(b);
+XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
@@ -593,16 +676,115 @@ void XMathGrad::GradDiv(XTensor * node)
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
-_Power(b, c, -2.0F);
-_Multiply(a, c, d);
-_ScaleAndShift(d, e, -1.0F);
-_Multiply(node->grad, e, b->grad, 1.0F);
+_Power(b, ab2, -2.0F);
+_Multiply(a, ab2, ab2);
+_ScaleAndShiftMe(ab2, -1.0F);
+_Multiply(node->grad, ab2, b->grad, 1.0F);
+
+DelTensorBuf(ab2);
node->visitMark = NODE_FINISHED;
-
-delete c;
-delete d;
-delete e;
+}
/*
gradient for division with one dimension
c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * (1/b)
dE/db = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradDivDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da = dE/dc * (1/b) */
_DivDim(node->grad, b, a->grad, n, 1.0);
/* dE/db = dE/dc * dc/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * aTMP1 = NewTensorBuf(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBuf(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBuf(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
}
DelTensorBuf(interGrad);
}
DelTensorBuf(aTMP1);
DelTensorBuf(aTMP2);
DelTensorBuf(bTMP);
DelTensorBuf(interGradTMP);
node->visitMark = NODE_FINISHED;
} }
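For the record, the gradient that GradDivDim implements above (a sketch using the same broadcast notation): with c = a / \tilde b,

    \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \cdot \frac{1}{\tilde b}, \qquad
    \frac{\partial E}{\partial b_j} = \sum_{i\,:\,i_n = j} \left(\frac{\partial E}{\partial c}\right)_i \left(-\frac{a_i}{b_j^2}\right)

which is exactly the _Negate / _Power(b, -2) / _MultiplyDim / _Multiply / reduce sequence.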
/* /*
...@@ -619,16 +801,16 @@ void XMathGrad::GradExp(XTensor * node) ...@@ -619,16 +801,16 @@ void XMathGrad::GradExp(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -645,16 +827,16 @@ void XMathGrad::GradSin(XTensor * node) ...@@ -645,16 +827,16 @@ void XMathGrad::GradSin(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -671,19 +853,17 @@ void XMathGrad::GradCos(XTensor * node) ...@@ -671,19 +853,17 @@ void XMathGrad::GradCos(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XTensor * c = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sin(a, b);
-_ScaleAndShift(b, c, -1.0F);
-_Multiply(node->grad, c, a->grad, 1.0F);
-
-node->visitMark = NODE_FINISHED;
-
-delete b;
-delete c;
+_ScaleAndShiftMe(b, -1.0F);
+_Multiply(node->grad, b, a->grad, 1.0F);
+
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -700,19 +880,17 @@ void XMathGrad::GradTan(XTensor * node) ...@@ -700,19 +880,17 @@ void XMathGrad::GradTan(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XTensor * c = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
-_Power(b, c, -2.0F);
-_Multiply(node->grad, c, a->grad, 1.0F);
-
-node->visitMark = NODE_FINISHED;
-
-delete b;
-delete c;
+_PowerMe(b, -2.0F);
+_Multiply(node->grad, b, a->grad, 1.0F);
+
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -818,16 +996,16 @@ void XMathGrad::GradAbsolute(XTensor * node) ...@@ -818,16 +996,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -843,17 +1021,10 @@ void XMathGrad::GradSign(XTensor * node) ...@@ -843,17 +1021,10 @@ void XMathGrad::GradSign(XTensor * node)
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
-XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XNoder::MakeGrad(a);
-b->SetZeroAll();
-_Sum(a->grad, b, a->grad);
+// we do nothing here
+// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
-
-delete b;
} }
/* /*
...@@ -869,17 +1040,10 @@ void XMathGrad::GradRound(XTensor * node) ...@@ -869,17 +1040,10 @@ void XMathGrad::GradRound(XTensor * node)
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
-XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XNoder::MakeGrad(a);
-b->SetZeroAll();
-_Sum(a->grad, b, a->grad);
+// we do nothing here
+// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
-
-delete b;
} }
/* /*
...@@ -895,7 +1059,7 @@ void XMathGrad::GradClip(XTensor * node) ...@@ -895,7 +1059,7 @@ void XMathGrad::GradClip(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
@@ -905,9 +1069,9 @@ void XMathGrad::GradClip(XTensor * node)
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -924,21 +1088,20 @@ void XMathGrad::GradReduceMean(XTensor * node) ...@@ -924,21 +1088,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
-XTensor * c = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
-_ScaleAndShift(b, c, 1.0F/n);
-_Sum(a->grad, c, a->grad);
-
-node->visitMark = NODE_FINISHED;
-
-delete b;
-delete c;
+_ScaleAndShiftMe(b, 1.0F/n);
+_Sum(a->grad, b, a->grad);
+
+DelTensorBuf(b);
+node->visitMark = NODE_FINISHED;
} }
/* /*
...@@ -955,27 +1118,28 @@ void XMathGrad::GradReduceSum(XTensor * node) ...@@ -955,27 +1118,28 @@ void XMathGrad::GradReduceSum(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
-XTensor * b = NewTensor(a);
+XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
-node->visitMark = NODE_FINISHED;
-delete b;
+DelTensor(b);
+node->visitMark = NODE_FINISHED;
} }
/*
gradient for reduceSumSquared
for
-c = reduceSumSquared(a, dim, b)
+c = \sum_i (a_i - b)^2
we have
-dE/da = Unsqueeze(dE/dc) * 2a
-dE/db = Unsqueeze(dE/dc) * (-2b)
+dE/da = Unsqueeze(dE/dc) * 2(a - b)
+dE/db = dE/dc * (-2) * \sum_i (a_i - b)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceSumSquared(XTensor * node) void XMathGrad::GradReduceSumSquared(XTensor * node)
...@@ -985,35 +1149,46 @@ void XMathGrad::GradReduceSumSquared(XTensor * node) ...@@ -985,35 +1149,46 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
-XTensor * c = NewTensor(a);
-XTensor * d = NewTensor(b);
-XTensor * e = NewTensor(c);
+XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+XTensor * d = NewTensorBuf(a, a->devID, a->mem);
+XTensor * e = NewTensorBuf(a, a->devID, a->mem);
+XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
-_ScaleAndShift(a, c, 2.0F);
-_ScaleAndShift(b, d, -2.0F);
+/* compute a-b */
+_Unsqueeze(b, c, dim, n);
+_Sub(a, c, d);
+_ReduceSum(d, f, dim);
+
+/* dE/da = Unsqueeze(dE/dc) * 2(a-b) */
+_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
-_Multiply(e, c, a->grad, 1.0F);
-_Multiply(node->grad, d, b->grad, 1.0F);
-
-node->visitMark = NODE_FINISHED;
-
-delete c;
-delete d;
-delete e;
+_Multiply(d, e, a->grad, 1.0F);
+
+/* dE/db = dE/dc * -2 * (a - b*n) */
+_ScaleAndShiftMe(f, -2.0F);
+_Multiply(node->grad, f, b->grad, 1.0F);
+
+DelTensorBuf(c);
+DelTensorBuf(d);
+DelTensorBuf(e);
+DelTensorBuf(f);
+
+node->visitMark = NODE_FINISHED;
} }
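A short derivation of the gradients used above (n is the size of the reduced dimension and the sum runs over it):

    c = \sum_i (a_i - b)^2 \;\Rightarrow\; \frac{\partial c}{\partial a_i} = 2(a_i - b), \qquad \frac{\partial c}{\partial b} = -2 \sum_i (a_i - b)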
/*
gradient for reduceVariance
for
-c = reduceVariance(a, dim, b)
+c = (1/n) * \sum_i (a_i - b)^2
+where b is the mean, and n is the size of dimension dim of a
we have
-dE/da = Unsqueeze(dE/dc) * 2a/dimSizeA[dim]
-dE/db = Unsqueeze(dE/dc) * (-2a/dimSizeA[dim])
+dE/da = Unsqueeze(dE/dc) * 2(a - b)/n
+dE/db = dE/dc * (-2/n) * \sum_i (a_i - b)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceVariance(XTensor * node) void XMathGrad::GradReduceVariance(XTensor * node)
...@@ -1023,26 +1198,36 @@ void XMathGrad::GradReduceVariance(XTensor * node) ...@@ -1023,26 +1198,36 @@ void XMathGrad::GradReduceVariance(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
-XTensor * c = NewTensor(a);
-XTensor * d = NewTensor(b);
-XTensor * e = NewTensor(a);
+XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+XTensor * d = NewTensorBuf(a, a->devID, a->mem);
+XTensor * e = NewTensorBuf(a, a->devID, a->mem);
+XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
-_ScaleAndShift(a, c, 2.0F / n);
-_ScaleAndShift(b, d, -2.0F / n);
+/* compute a-b */
+_Unsqueeze(b, c, dim, n);
+_Sub(a, c, d);
+_ReduceSum(d, f, dim);
+
+/* dE/da = Unsqueeze(dE/dc) * 2(a-b)/n */
+_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
-_Multiply(e, c, a->grad, 1.0F);
-_Multiply(node->grad, d, b->grad, 1.0F);
-
-node->visitMark = NODE_FINISHED;
-
-delete c;
-delete d;
-delete e;
+_Multiply(d, e, a->grad, 1.0F);
+
+/* dE/db = dE/dc * (-2/n) * sum(a-b) */
+_ScaleAndShiftMe(f, -2.0F / n);
+_Multiply(node->grad, f, b->grad, 1.0F);
+
+DelTensorBuf(c);
+DelTensorBuf(d);
+DelTensorBuf(e);
+DelTensorBuf(f);
+
+node->visitMark = NODE_FINISHED;
} }
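The same derivation with the 1/n factor gives the variance gradients used above:

    c = \frac{1}{n} \sum_i (a_i - b)^2 \;\Rightarrow\; \frac{\partial c}{\partial a_i} = \frac{2}{n}(a_i - b), \qquad \frac{\partial c}{\partial b} = -\frac{2}{n} \sum_i (a_i - b)

Note that when b is exactly the mean of a over that dimension, \sum_i (a_i - b) = 0, so the b term vanishes.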
} }
...@@ -53,6 +53,11 @@ private: ...@@ -53,6 +53,11 @@ private:
static static
void GradMultiply(XTensor * node); void GradMultiply(XTensor * node);
/* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */ /* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static static
void GradMatrixMul(XTensor * node); void GradMatrixMul(XTensor * node);
...@@ -92,6 +97,10 @@ private: ...@@ -92,6 +97,10 @@ private:
static static
void GradDiv(XTensor * node); void GradDiv(XTensor * node);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
/* gradient for reduceMean */ /* gradient for reduceMean */
static static
void GradReduceMean(XTensor * node); void GradReduceMean(XTensor * node);
......
...@@ -137,6 +137,8 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss) ...@@ -137,6 +137,8 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
XTensor * x = income.tails[0]; XTensor * x = income.tails[0];
XNoder::MakeGrad(x); XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss); lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
//XNoder::MakeGrad(root);
//lossGrad.Compute(gold, root, x, root->grad, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED; root->visitMark = NODE_FINISHED;
} }
/* we compuate dE/dy (y is the output) if no predefined activation function is used */ /* we compuate dE/dy (y is the output) if no predefined activation function is used */
......
...@@ -99,7 +99,7 @@ arguments: ...@@ -99,7 +99,7 @@ arguments:
(how many words) (how many words)
-shuffle: shuffle the training data -shuffle: shuffle the training data
-devid D: the id of the device used -devid D: the id of the device used
-    -1: GPU, >=0: GPUs
+    -1: CPU, >=0: GPUs
-mempool: use memory pools for memory management -mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training -autodiff: use automatic differentiation for training
......
...@@ -57,18 +57,22 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m ...@@ -57,18 +57,22 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
float minmax = 0; float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8); LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.08F); LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem); InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem); InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem); InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
wk.SetDataRand(-minmax, minmax); wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-minmax, minmax); wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-minmax, minmax); wv.SetDataRand(-finfoutv, finfoutv);
} }
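A hedged reading of the new initialization: the bound matches the Xavier/Glorot uniform initializer,

    W_{ij} \sim \mathcal{U}(-r, r), \qquad r = \sqrt{\frac{6 \cdot \text{scale}}{\text{fan\_in} + \text{fan\_out}}}

with fan_in = d and fan_out = d_k (or d_v) here, replacing the fixed attminmax bound used before.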
/* /*
...@@ -104,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v) ...@@ -104,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
XTensor scalar; XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */ /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
-scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
+scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
att = BMMul(scalar, vheads); att = BMMul(scalar, vheads);
/* concatenate the heads */ /* concatenate the heads */
......
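For reference, the Make() body above computes the standard scaled dot-product attention per head:

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V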
...@@ -53,16 +53,14 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my ...@@ -53,16 +53,14 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
devID = myDevID; devID = myDevID;
mem = myMem; mem = myMem;
-int d = 0;
-
LoadParamInt(argc, argv, "vsize", &vSize, -1);
-LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
-LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
-LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
+LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
+LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
+LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
-w.SetDataRandn(0, sqrt((float)eSize));
+w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
...@@ -84,11 +82,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length) ...@@ -84,11 +82,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int k = 0; k < eSize; k++){ for(int k = 0; k < eSize; k++){
if(k % 2 == 0){ if(k % 2 == 0){
int i = k/2; int i = k/2;
dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d)); dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
} }
else{ else{
int i = (k - 1)/2; int i = (k - 1)/2;
dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d)); dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
} }
} }
} }
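The loop above fills the usual sinusoidal positional encoding (now with explicit float casts):

    PE(pos, 2i) = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad PE(pos, 2i+1) = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)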
...@@ -135,10 +133,10 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -135,10 +133,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding; XTensor wordEmbedding;
/* then we make word embeddings */ /* then we make word embeddings */
-wordEmbedding = MMul(&input, w);
+wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
} }
} }
...@@ -29,7 +29,7 @@ using namespace nts; ...@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer namespace transformer
{ {
-#define DEFAULT_BEDDING_SIZE 512
+#define DEFAULT_EMBEDDING_SIZE 512
/* /*
embedding (of word at position i): embedding (of word at position i):
...@@ -53,6 +53,9 @@ public: ...@@ -53,6 +53,9 @@ public:
/* maximum length of the sequence */ /* maximum length of the sequence */
int maxLength; int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* word embedding matrix */ /* word embedding matrix */
XTensor w; XTensor w;
......
...@@ -38,7 +38,8 @@ AttEncoder::~AttEncoder() ...@@ -38,7 +38,8 @@ AttEncoder::~AttEncoder()
{ {
delete[] attentions; delete[] attentions;
delete[] fnns; delete[] fnns;
-delete[] layerNorms;
+delete[] attLayerNorms;
+delete[] fnnLayerNorms;
} }
/* /*
...@@ -53,13 +54,12 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM ...@@ -53,13 +54,12 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
devID = myDevID; devID = myDevID;
mem = myMem; mem = myMem;
LoadParamInt(argc, argv, "nstack", &nlayer, 6); LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, 512); LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, 512); LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1); LoadParamInt(argc, argv, "vsize", &vSize, -1);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(nlayer > 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\""); CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */ /* embedding model */
...@@ -67,13 +67,15 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM ...@@ -67,13 +67,15 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
attentions = new T2TAttention[nlayer]; attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer]; fnns = new T2TFNN[nlayer];
-layerNorms = new T2TLN[nlayer];
+attLayerNorms = new T2TLN[nlayer];
+fnnLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
    attentions[i].InitModel(argc, argv, myDevID, myMem);
    fnns[i].InitModel(argc, argv, myDevID, myMem);
-    layerNorms[i].InitModel(argc, argv, myDevID, myMem);
+    attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+    fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
} }
} }
...@@ -103,10 +105,7 @@ XTensor AttEncoder::Make(XTensor &input) ...@@ -103,10 +105,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */ /* TODO: dropout */
/* layer normalization */ /* layer normalization */
-ln = layerNorms[i].Make(res);
-
-/* input of next layer */
-x = ln;
+x = attLayerNorms[i].Make(res);
/* fnn */ /* fnn */
fnn = fnns[i].Make(x); fnn = fnns[i].Make(x);
...@@ -117,10 +116,7 @@ XTensor AttEncoder::Make(XTensor &input) ...@@ -117,10 +116,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */ /* TODO: dropout */
/* layer normalization */ /* layer normalization */
-ln = layerNorms[i].Make(res);
-
-/* input of next layer */
-x = ln;
+x = fnnLayerNorms[i].Make(res);
} }
return x; return x;
......
...@@ -86,8 +86,11 @@ public: ...@@ -86,8 +86,11 @@ public:
/* attention model of each layer */ /* attention model of each layer */
T2TAttention * attentions; T2TAttention * attentions;
-/* layer normalization */
-T2TLN * layerNorms;
+/* layer normalization for fnn */
+T2TLN * fnnLayerNorms;
+
+/* layer normalization for attention */
+T2TLN * attLayerNorms;
/* input tensor of the encoder */ /* input tensor of the encoder */
XTensor * input; XTensor * input;
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/ */
#include <math.h>
#include "T2TFNN.h" #include "T2TFNN.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
...@@ -55,10 +56,10 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem) ...@@ -55,10 +56,10 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
float minmax = 0; float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.08F); LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem); InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem); InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
...@@ -66,10 +67,14 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem) ...@@ -66,10 +67,14 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem); InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem); InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
-w1.SetDataRand(-minmax, minmax);
-b1.SetDataRand(-minmax, minmax);
-w2.SetDataRand(-minmax, minmax);
-b2.SetDataRand(-minmax, minmax);
+float scale = 1.0F;
+float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
+
+w1.SetDataRand(-finfout1, finfout1);
+b1.SetZeroAll();
+w2.SetDataRand(-finfout2, finfout2);
+b2.SetZeroAll();
} }
/* /*
...@@ -83,10 +88,10 @@ XTensor T2TFNN::Make(XTensor &input) ...@@ -83,10 +88,10 @@ XTensor T2TFNN::Make(XTensor &input)
XTensor t1; XTensor t1;
/* t1 = max(0, x * w1 + b1) */ /* t1 = max(0, x * w1 + b1) */
-t1 = Rectify(MMul(input, X_NOTRANS, w1, X_NOTRANS) + b1);
+t1 = Rectify(MMul(input, w1) + b1);
/* result = t1 * w2 + b2 */
-return MMul(t1, X_NOTRANS, w2, X_NOTRANS) + b2;
+return MMul(t1, w2) + b2;
} }
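That is, the position-wise feed-forward sub-layer:

    \mathrm{FFN}(x) = \max(0,\; x W_1 + b_1)\, W_2 + b_2

with the shorter MMul calls presumably defaulting to non-transposed operands.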
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
*/ */
#include "T2TLayerNormal.h" #include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h" #include "../../tensor/core/CHeader.h"
namespace transformer namespace transformer
...@@ -48,6 +50,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem) ...@@ -48,6 +50,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
{ {
devID = myDevID; devID = myDevID;
mem = myMem; mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / (d + d));
w.SetDataRand(-finfout, finfout);
b.SetZeroAll();
} }
/* /*
...@@ -60,6 +74,7 @@ y = ...@@ -60,6 +74,7 @@ y =
XTensor T2TLN::Make(XTensor &input) XTensor T2TLN::Make(XTensor &input)
{ {
XTensor &x = input; XTensor &x = input;
XTensor xn;
XTensor mean; XTensor mean;
XTensor variance; XTensor variance;
XTensor standard; XTensor standard;
...@@ -67,7 +82,7 @@ XTensor T2TLN::Make(XTensor &input) ...@@ -67,7 +82,7 @@ XTensor T2TLN::Make(XTensor &input)
XTensor standardFilled; XTensor standardFilled;
/* \mu = (sum_i x_i)/m */ /* \mu = (sum_i x_i)/m */
-mean = ReduceSum(x, x.order - 1);
+mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */ /* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean); variance = ReduceVariance(x, x.order - 1, mean);
...@@ -76,12 +91,15 @@ XTensor T2TLN::Make(XTensor &input) ...@@ -76,12 +91,15 @@ XTensor T2TLN::Make(XTensor &input)
standard = Power(variance, 0.5F); standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
-   the same size of x */
+   the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
-return (x - meanFilled)/standardFilled;
+xn = (x - meanFilled)/standardFilled;
+
+/* result = x' * w + b */
+return MMul(xn, w) + b;
} }
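A hedged summary of what the layer now computes (note that w here is a d x d transform applied by MMul, rather than the element-wise gain used in the standard formulation):

    y = \frac{x - \mu}{\sigma}\, W + b, \qquad \mu = \frac{1}{m} \sum_i x_i, \quad \sigma = \sqrt{\frac{1}{m} \sum_i (x_i - \mu)^2}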
} }
...@@ -29,6 +29,8 @@ using namespace nts; ...@@ -29,6 +29,8 @@ using namespace nts;
namespace transformer namespace transformer
{ {
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN class T2TLN
{ {
public: public:
...@@ -37,6 +39,12 @@ public: ...@@ -37,6 +39,12 @@ public:
/* memory pool */ /* memory pool */
XMem * mem; XMem * mem;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
public: public:
/* constructor */ /* constructor */
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer namespace transformer
{ {
...@@ -81,9 +82,9 @@ make the entire network (with the output softmax layer) ...@@ -81,9 +82,9 @@ make the entire network (with the output softmax layer)
*/ */
void T2TModel::Make(XTensor &input, XTensor &output) void T2TModel::Make(XTensor &input, XTensor &output)
{ {
+XTensor encoding;
+
if(isLM){
-    XTensor encoding;
    encoding = MakeEncoding(input);
outputLayer.Make(encoding, output); outputLayer.Make(encoding, output);
} }
...@@ -92,4 +93,4 @@ void T2TModel::Make(XTensor &input, XTensor &output) ...@@ -92,4 +93,4 @@ void T2TModel::Make(XTensor &input, XTensor &output)
} }
} }
} }
\ No newline at end of file
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/ */
#include <math.h>
#include "T2TOutput.h" #include "T2TOutput.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
...@@ -56,12 +57,15 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe ...@@ -56,12 +57,15 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
float minmax = 0; float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1); LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE); LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F); LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem); InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
w.SetDataRand(-minmax, minmax);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
} }
/* /*
...@@ -89,4 +93,4 @@ void T2TOutput::Make(XTensor &input, XTensor &output) ...@@ -89,4 +93,4 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
output = LogSoftmax(MMul(x, w), -1); output = LogSoftmax(MMul(x, w), -1);
} }
} }
\ No newline at end of file
...@@ -59,6 +59,8 @@ void T2TTrainer::Init(int argc, const char ** argv) ...@@ -59,6 +59,8 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1); LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
LoadParamInt(argc, argv, "nepoch", &nepoch, 1); LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
LoadParamInt(argc, argv, "nstep", &nstep, 1); LoadParamInt(argc, argv, "nstep", &nstep, 1);
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1); LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false); LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000); LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
...@@ -82,6 +84,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model) ...@@ -82,6 +84,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
int wordCountTotal = 0; int wordCountTotal = 0;
bool isEnd = false; bool isEnd = false;
float loss = 0; float loss = 0;
float lr = 0;
XNet net; XNet net;
...@@ -108,8 +111,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model) ...@@ -108,8 +111,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* back-propagation for obtaining gradients */ /* back-propagation for obtaining gradients */
net.Backward(output, batch, CROSSENTROPY); net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
//lr = 0.00005F;
/* update the parameters */
-Update(model);
+Update(model, lr);
/* get probabilities */ /* get probabilities */
float prob = GetProb(&output, &batch, NULL); float prob = GetProb(&output, &batch, NULL);
...@@ -125,18 +132,21 @@ void T2TTrainer::Train(const char * fn, T2TModel * model) ...@@ -125,18 +132,21 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) { if (step % 1 == 0) {
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
-XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
-        elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
+XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
+        lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
} }
} }
fclose(file); fclose(file);
if (isEnd)
break;
} }
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
-XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
-        elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
+XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
+        lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n", XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch); elapsed, step, epoch);
} }
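The learning-rate line above follows the warmup schedule from the Transformer paper, scaled by d^{-1/2}:

    lr = d^{-0.5} \cdot \min\!\left((\text{step} + 1)^{-0.5},\; (\text{step} + 1) \cdot \text{nwarmup}^{-1.5}\right)

so lr grows linearly for the first nwarmup steps and then decays as the inverse square root of the step number.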
...@@ -317,10 +327,14 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs) ...@@ -317,10 +327,14 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
} }
/* /*
update the model by delta rule update the model by delta rule
\theta_new = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
>> model - the t2t model >> model - the t2t model
>> lr - learning rate
*/ */
-void T2TTrainer::Update(T2TModel * model)
+void T2TTrainer::Update(T2TModel * model, const float lr)
{ {
XList ws(100); XList ws(100);
...@@ -331,6 +345,13 @@ void T2TTrainer::Update(T2TModel * model) ...@@ -331,6 +345,13 @@ void T2TTrainer::Update(T2TModel * model)
ws.Add(&model->encoder.fnns[i].b1); ws.Add(&model->encoder.fnns[i].b1);
ws.Add(&model->encoder.fnns[i].w2); ws.Add(&model->encoder.fnns[i].w2);
ws.Add(&model->encoder.fnns[i].b2); ws.Add(&model->encoder.fnns[i].b2);
ws.Add(&model->encoder.attentions[i].wk);
ws.Add(&model->encoder.attentions[i].wq);
ws.Add(&model->encoder.attentions[i].wv);
ws.Add(&model->encoder.fnnLayerNorms[i].w);
ws.Add(&model->encoder.fnnLayerNorms[i].b);
ws.Add(&model->encoder.attLayerNorms[i].w);
ws.Add(&model->encoder.attLayerNorms[i].b);
} }
ws.Add(&model->encoder.embedder.w); ws.Add(&model->encoder.embedder.w);
...@@ -339,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model) ...@@ -339,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model)
XTensor * para = (XTensor*)ws.Get(i); XTensor * para = (XTensor*)ws.Get(i);
XTensor * paraGrad = para->grad; XTensor * paraGrad = para->grad;
if (para == NULL || paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!"); CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!"); CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
/*
DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
for (int i = 0; i < para->unitNum; i++) {
if (IsNAN(d[i]) || IsINF(d[i])) {
int nnn = 0;
}
if (IsNAN(g[i]) || IsINF(g[i])) {
int nnn = 0;
}
}
delete[] d;
delete[] g;
*/
/* the delta rule */ /* the delta rule */
-_Sum(para, paraGrad, para, -lrate);
+_Sum(para, paraGrad, para, -lr);
/* clear gradient */
paraGrad->SetZeroAll();
} }
} }
......
...@@ -63,6 +63,12 @@ public: ...@@ -63,6 +63,12 @@ public:
/* indicates whether the sequence is sorted by length */ /* indicates whether the sequence is sorted by length */
bool isLenSorted; bool isLenSorted;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */ /* vocabulary size of the source side */
int vSize; int vSize;
...@@ -105,7 +111,7 @@ public: ...@@ -105,7 +111,7 @@ public:
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs); float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */ /* update the model by delta rule */
-void Update(T2TModel * model);
+void Update(T2TModel * model, const float lr);
}; };
......
...@@ -26,6 +26,8 @@ ...@@ -26,6 +26,8 @@
namespace transformer namespace transformer
{ {
FILE * tmpFILE;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP) void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{ {
char vname[128]; char vname[128];
......
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
namespace transformer namespace transformer
{ {
extern FILE * tmpFILE;
/* load arguments */ /* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP); void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP); void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
......
...@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv) ...@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0) if(argc == 0)
return 1; return 1;
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv); ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH]; char * trainFN = new char[MAX_LINE_LENGTH];
...@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv) ...@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv)
delete[] trainFN; delete[] trainFN;
fclose(tmpFILE);
return 0; return 0;
} }
......
...@@ -45,7 +45,7 @@ int main( int argc, const char ** argv ) ...@@ -45,7 +45,7 @@ int main( int argc, const char ** argv )
//_CrtSetBreakAlloc(123); //_CrtSetBreakAlloc(123);
/* a tiny test */ /* a tiny test */
-SmallTest();
+//SmallTest();
//_CrtDumpMemoryLeaks(); //_CrtDumpMemoryLeaks();
//return 0; //return 0;
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts { namespace nts {
-#define _XINLINE_ inline
+#define _XINLINE_
//#define DOUBELPRICSION //#define DOUBELPRICSION
......
...@@ -45,12 +45,16 @@ const char * GetOPName(int type) ...@@ -45,12 +45,16 @@ const char * GetOPName(int type)
return "M_CLIP"; return "M_CLIP";
else if (type == MATH_DIV) else if (type == MATH_DIV)
return "M_DIV"; return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MATRIXMUL) else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL"; return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED) else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED"; return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY) else if (type == MATH_MULTIPLY)
return "M_MULTIPLY"; return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_NEGATE) else if (type == MATH_NEGATE)
return "M_NEGATE"; return "M_NEGATE";
else if (type == MATH_NORMALIZE) else if (type == MATH_NORMALIZE)
...@@ -61,10 +65,12 @@ const char * GetOPName(int type) ...@@ -61,10 +65,12 @@ const char * GetOPName(int type)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if (type == MATH_SIGN) else if (type == MATH_SIGN)
return "M_SIGN"; return "M_SIGN";
-else if (type == MATH_SUM)
-    return "M_SUM";
else if (type == MATH_SUB)
    return "M_SUB";
+else if (type == MATH_SUBDIM)
+    return "M_SUBDIM";
+else if (type == MATH_SUM)
+    return "M_SUM";
else if (type == MATH_SUMDIM) else if (type == MATH_SUMDIM)
return "M_SUMDIM"; return "M_SUMDIM";
else if (type == REDUCE_REDUCEMAX) else if (type == REDUCE_REDUCEMAX)
......
...@@ -41,17 +41,20 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -41,17 +41,20 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
-#define MATH_MATRIXMUL MATH_DIV + 1
+#define MATH_DIVDIM MATH_DIV + 1
+#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
-#define MATH_NEGATE MATH_MULTIPLY + 1
+#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
+#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
-#define MATH_SUM MATH_SIGN + 1
-#define MATH_SUB MATH_SUM + 1
-#define MATH_SUMDIM MATH_SUB + 1
+#define MATH_SUB MATH_SIGN + 1
+#define MATH_SUBDIM MATH_SUB + 1
+#define MATH_SUM MATH_SUBDIM + 1
+#define MATH_SUMDIM MATH_SUM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE_REDUCEMAX REDUCE + 1 #define REDUCE_REDUCEMAX REDUCE + 1
......
...@@ -1046,7 +1046,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2) ...@@ -1046,7 +1046,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 1 is out of range!"); CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 1 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type."); CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d1}; int dims[3] = {d0, d1, d2};
return SetToDevice(devID, GetCell(dims, 3), value); return SetToDevice(devID, GetCell(dims, 3), value);
} }
......
...@@ -27,15 +27,18 @@ ...@@ -27,15 +27,18 @@
#include "../XTensor.h" #include "../XTensor.h"
#include "arithmetic/Div.h" #include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/MatrixMul.h" #include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h" #include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h" #include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h" #include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h" #include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h" #include "arithmetic/Negate.h"
#include "arithmetic/Sign.h" #include "arithmetic/Sign.h"
#include "arithmetic/Sub.h" #include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h" #include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h" #include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h" #include "arithmetic/SumByColumnVT.h"
......
...@@ -31,7 +31,7 @@ element-wise division of two tensors: ...@@ -31,7 +31,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i) c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element where i is the index of the element
*/ */
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0); void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise division of two tensors (do it on site) element-wise division of two tensors (do it on site)
...@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing ...@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i) a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element where i is the index of the element
*/ */
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0); void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise division of two tensors (return a XTensor structure) element-wise division of two tensors (return a XTensor structure)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in addition!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaDivDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++){
if(alpha == 0.0F)
cp[k] = ap[k] / bv;
else
cp[k] = ap[k] / bv + alpha * cp[k];
}
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor division of two tensors (do it on site)
keep the result in the input tensor and return nothing
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_DivDim(a, b, a, n, alpha);
}
/*
tensor division of two tensors (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor division
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
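
A quick way to see what the CPU branch of _DivDim iterates over: the standalone sketch below (not part of this commit; the 2 x 3 x 4 shape and n = 1 are made up for illustration) reproduces the stride/blockSize/blockNum arithmetic used above.

#include <cstdio>

int main()
{
    /* hypothetical tensor a of shape 2 x 3 x 4, broadcast tensor b of size dimSize[n] */
    int dimSize[3] = {2, 3, 4};
    int order = 3;
    int n = 1;

    int stride = 1;              /* consecutive elements of a that share one value of b */
    int blockSize = dimSize[n];  /* size of b */
    int blockNum = 1;            /* how many times the (blockSize x stride) pattern repeats */
    for (int i = order - 1; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];
        else if (i < n)
            blockNum *= dimSize[i];
    }

    /* prints stride=4 blockSize=3 blockNum=2: since stride > 1, _DivDim takes the
       branch where each group of 4 consecutive elements is divided by b[j % 3] */
    printf("stride=%d blockSize=%d blockNum=%d\n", stride, blockSize, blockNum);
    return 0;
}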
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "DivDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division of a tensor and a row vector
c = a / b + alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a / b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_CUH__
#define __DIVDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting (cuda version)
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_H__
#define __DIVDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division of two tensors:
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we keep the result in the input tensor a and return nothing
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
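
A minimal usage sketch of the returning form declared above (illustration only, not part of this commit; it assumes the usual XTensor helpers InitTensor2D/InitTensor1D, Set2D/Set1D and Dump, and the shapes and values are made up):

#include "DivDim.h"

using namespace nts;

void DivDimExample()
{
    XTensor a;
    XTensor b;
    XTensor c;

    InitTensor2D(&a, 2, 4);   /* a: 2 x 4 */
    InitTensor1D(&b, 4);      /* b matches dimension 1 of a */

    a.SetZeroAll();
    b.SetZeroAll();
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 4; j++)
            a.Set2D((DTYPE)(i * 4 + j + 1), i, j);
    for (int j = 0; j < 4; j++)
        b.Set1D((DTYPE)(j + 1), j);

    /* every column j of a is divided by b[j]; alpha defaults to 0 */
    c = DivDim(a, b, 1);
    c.Dump(stderr, "c:");
}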
...@@ -31,7 +31,7 @@ element-wise product of two tensors: ...@@ -31,7 +31,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element where i is the index of the element
*/ */
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0); void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise product of two tensors (do it on site) element-wise product of two tensors (do it on site)
...@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing ...@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i) a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element where i is the index of the element
*/ */
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0); void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise product of two tensors (return a XTensor structure) element-wise product of two tensors (return a XTensor structure)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaMultiplyDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if(a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++)
if(alpha == 0.0F)
cp[k] = ap[k] * bv;
else
cp[k] = ap[k] * bv + alpha * cp[k];
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
a = a * b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_MultiplyDim(a, b, a, n, alpha);
}
/*
tensor multiplication (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
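
For reference, the broadcast that _MultiplyDim performs in the 2-D, n = 1 case is just a column-wise scaling. The plain-C++ loop below (illustrative only, not library code) computes the same result for the alpha = 0 case and can be used to eyeball the output of the function above.

#include <cstdio>

/* reference result for c = a * b broadcast along dimension 1 of a 2-D array,
   i.e. c[i][j] = a[i][j] * b[j] */
void MultiplyDimReference2D(const float * a, const float * b, float * c,
                            int rowNum, int colNum)
{
    for (int i = 0; i < rowNum; i++)
        for (int j = 0; j < colNum; j++)
            c[i * colNum + j] = a[i * colNum + j] * b[j];
}

int main()
{
    float a[6] = {1, 2, 3, 4, 5, 6};   /* 2 x 3 */
    float b[3] = {10, 100, 1000};
    float c[6];

    MultiplyDimReference2D(a, b, c, 2, 3);
    for (int i = 0; i < 6; i++)
        printf("%g ", c[i]);           /* 10 200 3000 40 500 6000 */
    printf("\n");
    return 0;
}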
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "MultiplyDim.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor multiplication of a tensor and a row vector
c = a * b + \alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.x];
}
/*
tensor multiplication of a tensor and a column vector
c = a * b + \alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.y];
}
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelMultiplyWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_CUH__
#define __MULTIPLYDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting (cuda version) */
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_H__
#define __MULTIPLYDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0.0);
/* tensor multiplication a = a * b + \alpha * a where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sub.h" #include "Sub.h"
#include "Sub.cuh" #include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -123,7 +124,35 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -123,7 +124,35 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
{ {
_Sub(a, b, a, beta); _Sub(a, b, a, beta);
} }
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < a.order; i++){
if(a.dimSize[i] == b.unitNum){
hitDim = i;
hitCount++;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/* /*
tensor subtraction c = a - b * \beta (return a XTensor structure) tensor subtraction c = a - b * \beta (return a XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
...@@ -138,12 +167,29 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -138,12 +167,29 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
/* call _Sub function */ int n = GetSubDimIndex(a, b);
_Sub(&a, &b, &c, beta);
if(n == -1){
/* tensor connections */ /* call _Sub function */
XLink::MakeLink(&a, &b, &c, MATH_SUB); _Sub(&a, &b, &c, beta);
XLink::AddParamToHead(&c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
else if(n >= 0 && n < a.order){
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
else{
ShowNTErrors("Something is wrong!");
}
return c; return c;
} }
......
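
The dispatch added to Sub above relies on GetSubDimIndex finding exactly one dimension of a whose size equals b->unitNum. A standalone sketch of the same rule on raw shapes (hypothetical shapes, illustration only; it ignores the early exits on tensor order and identical shapes):

#include <cstdio>

/* mirror of the rule used by GetSubDimIndex: return the unique dimension of a
   whose size equals the total number of elements of b, or -1 if there is none
   or more than one (in which case Sub falls back to the element-wise path) */
int FindBroadcastDim(const int * dimSizeA, int orderA, int unitNumB)
{
    int hitCount = 0;
    int hitDim = -1;
    for (int i = 0; i < orderA; i++) {
        if (dimSizeA[i] == unitNumB) {
            hitDim = i;
            hitCount++;
        }
    }
    return hitCount == 1 ? hitDim : -1;
}

int main()
{
    int a1[3] = {2, 3, 4};
    printf("%d\n", FindBroadcastDim(a1, 3, 3));   /* 1: dispatch to SubDim along dimension 1 */
    printf("%d\n", FindBroadcastDim(a1, 3, 5));   /* -1: no match, element-wise Sub */

    int a2[2] = {4, 4};
    printf("%d\n", FindBroadcastDim(a2, 2, 4));   /* -1: ambiguous (two matching dimensions) */
    return 0;
}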
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (XTensor::IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
a = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
return c;
}
}
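
A minimal usage sketch of SubDim and its on-site variant, e.g. removing a per-column offset from a 2-D tensor (illustration only; it assumes the XTensor setters used elsewhere in the library, and the shapes and values are made up):

#include "SubDim.h"

using namespace nts;

void SubDimExample()
{
    XTensor x;
    XTensor bias;
    XTensor y;

    InitTensor2D(&x, 2, 3);
    InitTensor1D(&bias, 3);      /* one offset per column of x */

    x.SetZeroAll();
    bias.SetZeroAll();
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 3; j++)
            x.Set2D((DTYPE)(10 * i + j), i, j);
    for (int j = 0; j < 3; j++)
        bias.Set1D((DTYPE)j, j);

    /* y[i][j] = x[i][j] - bias[j]; beta defaults to 1.0 */
    y = SubDim(x, bias, 1);
    y.Dump(stderr, "y:");

    /* on-site form with an explicit beta: x[i][j] -= 0.5 * bias[j] */
    _SubDim(&x, &bias, 1, 0.5F);
    x.Dump(stderr, "x:");
}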
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting */
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction a = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
...@@ -131,23 +131,43 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -131,23 +131,43 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
} }
/* /*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor >> a - a tensor
>> b - another tensor for sum >> b - another tensor for sum
*/ */
int GetSumDimIndex(const XTensor &a, const XTensor &b) int GetSumDimIndex(const XTensor &a, const XTensor &b)
{ {
//if(a.order < b.order)
// return -1;
//int hitCount = 0;
//int hitDim = -1;
//for(int i = 0; i < b.order; i++){
// if(b.dimSize[b.order - 1 - i] == 1)
// continue;
// else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
// hitCount++;
// hitDim = a.order - b.order + i;
// }
//}
//if(hitCount == 1)
// return hitDim;
//else
// return -1;
if(a.order < b.order) if(a.order < b.order)
return -1; return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0; int hitCount = 0;
int hitDim = -1; int hitDim = -1;
for(int i = 0; i < b.order; i++){ for(int i = 0; i < a.order; i++){
if(b.dimSize[b.order - 1 - i] == 1) if(a.dimSize[i] == b.unitNum){
continue; hitDim = i;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++; hitCount++;
hitDim = a.order - b.order + i;
} }
} }
...@@ -182,7 +202,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -182,7 +202,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
XLink::AddParamToHead(&c, beta); XLink::AddParamToHead(&c, beta);
} }
else if(n >= 0 && n < a.order){ else if(n >= 0 && n < a.order){
/* call _Sum function */ /* call _SumDim function */
_SumDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, beta);
/* tensor connections */ /* tensor connections */
......
...@@ -118,6 +118,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high) ...@@ -118,6 +118,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT); XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low); XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high); XLink::AddParamToHeadInt(&c, high);
......
...@@ -70,8 +70,8 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -70,8 +70,8 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize; fanOut = numOutputFmaps * receptiveFieldSize;
} }
DTYPE std = gain * sqrt(2.0/(fanIn + fanOut)); DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = sqrt(3.0) * std; DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a); _SetDataRand(tensor, -a, a);
} }
......
...@@ -213,7 +213,7 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -213,7 +213,7 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
DTYPE variance = upper - lower; DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT) if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, lower, variance); KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE) else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance); KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
......
...@@ -49,7 +49,7 @@ void _SetDataFixedDouble(XTensor * tensor, double p); ...@@ -49,7 +49,7 @@ void _SetDataFixedDouble(XTensor * tensor, double p);
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a normal distribution with specified mean and standard deviation */ /* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation); void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -60,8 +60,12 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p) ...@@ -60,8 +60,12 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p)
bData[i] = aData[i] * aData[i]; bData[i] = aData[i] * aData[i];
} }
else { else {
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++) {
bData[i] = (DTYPE)pow(aData[i], p); if (p < 0 && aData[i] == 0)
bData[i] = 1e20F;
else
bData[i] = (DTYPE)pow(aData[i], p);
}
} }
} }
......
...@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size) ...@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size) {
b[i] = pow(a[i], p); DTYPE v = a[i];
if (p < 0 && v == 0)
b[i] = 1e20;
else
b[i] = pow(a[i], p);
}
} }
/* /*
...@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size) ...@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size)
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else #else
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size) {
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p))); float v = __half2float(a[i]);
if (__half2float(p) < 0 && v == 0)
b[i] = __float2half(1e20);
else
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
}
#endif #endif
} }
......
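
Both the CPU and CUDA changes above guard the same corner case: pow(0, p) with p < 0 is undefined (inf/NaN), so the result is clamped to a large sentinel instead. A tiny CPU-side illustration of that policy (1e20F is the sentinel the patch uses; the helper below is illustrative, not library code):

#include <cmath>
#include <cstdio>

float PowerClamped(float x, float p)
{
    if (p < 0 && x == 0)
        return 1e20F;            /* avoid inf from pow(0, negative exponent) */
    return powf(x, p);
}

int main()
{
    printf("%g\n", PowerClamped(2.0F, -2.0F));   /* 0.25 */
    printf("%g\n", PowerClamped(0.0F, -2.0F));   /* 1e+20 instead of inf */
    return 0;
}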
...@@ -29,6 +29,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,6 +29,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/*
use PTX code to do a warp-wide max reduction over float data
*/
__device__ __forceinline__
float shflDownReduceMax(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.f32 p, %1, r0; "
"@p mov.f32 %1,r0;"
"mov.f32 %0,%1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX code to do a warp-wide max reduction over int data
*/
__device__ __forceinline__
int shflDownReduceMax(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
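/*
for reference (not part of this commit): the same warp-wide max reduction can be
written with the shuffle intrinsics instead of inline PTX; the sketch below is an
illustration that assumes a CUDA 9+ toolkit providing __shfl_down_sync
*/
__device__ __forceinline__
float shflDownReduceMaxIntrinsic(float input)
{
    /* fold the upper half of the warp onto the lower half: 16 -> 8 -> 4 -> 2 -> 1 */
    for (int offset = 16; offset > 0; offset >>= 1) {
        float other = __shfl_down_sync(0xffffffff, input, offset, 32);
        if (input < other)
            input = other;
    }
    /* lane 0 now holds the maximum of the whole warp, as in the PTX versions above */
    return input;
}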
/* /*
reduce a tensor to another that keeps the max value along a dimension - slow version reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have Given a block of data, we go over each dimension i in the stride and we have
...@@ -63,7 +128,7 @@ void KernelReduceMax(DTYPE * input, DTYPE * output, ...@@ -63,7 +128,7 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
int iOffset = i % stride; int iOffset = i % stride;
DTYPE value = (i < stride * blockNum && j < strideNum) ? DTYPE value = (i < stride * blockNum && j < strideNum) ?
input[blockSize * k + stride * j + iOffset]: FLOAT_MIN; input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
/* load data into the shared mem */ /* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
...@@ -186,30 +251,26 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output, ...@@ -186,30 +251,26 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
int k = i / stride; int k = i / stride;
int iOffset = i % stride; int iOffset = i % stride;
DTYPE * data = iData + threadIdx.x * blockDim.y; DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize; DTYPE * inputData = input + k * blockSize;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset]: FLOAT_MIN; DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN; DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
/* load data into the shared mem */ value = MAX(value, value2);
data[tid] = MAX(value, value2); value = shflDownReduceMax(value);
if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads(); __syncthreads();
/* unroll the warp */ if (tid < 32) {
if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();} if (tid < blockDim.y / 32)
if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();} value = data[tid];
if(goodSize >= 128) {if(tid < 64) {if(data[tid] < data[tid + 64]) data[tid] = data[tid + 64];} __syncthreads();} else
if(goodSize >= 64) {if(tid < 32) {if(data[tid] < data[tid + 32]) data[tid] = data[tid + 32];} __syncthreads();} value = FLOAT_MIN;
if(goodSize >= 32) {if(tid < 16) {if(data[tid] < data[tid + 16]) data[tid] = data[tid + 16];} __syncthreads();} value = shflDownReduceMax(value);
if(goodSize >= 16) {if(tid < 8) {if(data[tid] < data[tid + 8]) data[tid] = data[tid + 8];} __syncthreads();} if (tid == 0 && blockIdx.y < reducedStrideNum)
if(goodSize >= 8) {if(tid < 4) {if(data[tid] < data[tid + 4]) data[tid] = data[tid + 4];} __syncthreads();} output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
if(goodSize >= 4) {if(tid < 2) {if(data[tid] < data[tid + 2]) data[tid] = data[tid + 2];} __syncthreads();} }
if(goodSize >= 2) {if(tid < 1) {if(data[tid] < data[tid + 1]) data[tid] = data[tid + 1];} __syncthreads();}
/* write result for this block to the output array */
if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
} }
/* /*
...@@ -271,9 +332,9 @@ void KernelReduceMaxFast(__half * input, __half * output, ...@@ -271,9 +332,9 @@ void KernelReduceMaxFast(__half * input, __half * output,
if (goodSize >= 64) { if (tid < 32) { if (data[tid] < data[tid + 32]) data[tid] = data[tid + 32]; } __syncthreads(); } if (goodSize >= 64) { if (tid < 32) { if (data[tid] < data[tid + 32]) data[tid] = data[tid + 32]; } __syncthreads(); }
if (goodSize >= 32) { if (tid < 16) { if (data[tid] < data[tid + 16]) data[tid] = data[tid + 16]; } __syncthreads(); } if (goodSize >= 32) { if (tid < 16) { if (data[tid] < data[tid + 16]) data[tid] = data[tid + 16]; } __syncthreads(); }
if (goodSize >= 16) { if (tid < 8) { if (data[tid] < data[tid + 8]) data[tid] = data[tid + 8]; } __syncthreads(); } if (goodSize >= 16) { if (tid < 8) { if (data[tid] < data[tid + 8]) data[tid] = data[tid + 8]; } __syncthreads(); }
if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); } if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); }
if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); } if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); }
if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); } if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); }
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
/* write result for this block to the output array */ /* write result for this block to the output array */
...@@ -291,7 +352,7 @@ reduce a tensor to another that keeps the max value along a dimension - simple ...@@ -291,7 +352,7 @@ reduce a tensor to another that keeps the max value along a dimension - simple
*/ */
__global__ __global__
void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output, void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
int stride, int strideNum, int blockSize, int blockNum) int stride, int strideNum, int blockSize, int blockNum)
{ {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -326,6 +387,108 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output, ...@@ -326,6 +387,108 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
op[offset] = max; op[offset] = max;
} }
/*
allocate the number of warps per vector according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8){
warpNum = 8;
if (vectorNum < 20 * 4){
warpNum = 16;
            if (vectorNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
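/*
Editor-added usage sketch with hypothetical sizes: for vectorNum = 100 vectors of
vectorSize = 4096 elements each, vectorNum is below 20 * 8 but not below 20 * 4, so
warpNum stays 8 and the helper returns grid = (100, 1, 1) and block = (1, 256, 1),
i.e. one block per vector with 8 warps per block. For short vectors the warp count
is clamped to ceil(vectorSize / 32) instead.
*/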
/*
fold threads.x into blocks.x and pad threads.y to a full warp so that the warp-shuffle optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1) {
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
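/*
Editor-added usage sketch with hypothetical numbers: if GetCudaThread2D returns
blocks = (8, 2) and threads = (4, 16), then after adjustThreadForUseWarpOptimization
we get blocks = (32, 2) and threads = (1, 32): the x-threads are folded into the grid
and the y-dimension is padded to a full 32-thread warp, which is the layout the
shuffle-based kernels in this file expect.
*/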
/*
in some cases we use fewer blocks to improve efficiency
*/
__global__
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
{
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int startIndex = idy * strideNum;
DTYPE threadMax = FLOAT_MIN;
for (int i = idx; i < strideNum; i += 32) {
threadMax = max(input[startIndex + i], threadMax);
}
threadMax = shflDownReduceMax(threadMax);
if (idx == 0)
output[idy] = threadMax;
}
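/*
Editor-added note on the thread-to-data mapping above: each 32-lane warp owns one reduced
vector. For warp idy, lane idx visits input[idy * strideNum + idx], then steps by 32, and the
per-lane maxima are combined with shflDownReduceMax so that only lane 0 writes output[idy].
For example, with strideNum = 100, lane 3 of warp 2 reads input[203], input[235], input[267]
and input[299]. The kernel does not guard idy against the number of vectors, so the launch is
assumed to supply no surplus warps.
*/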
/*
reduce with warp shuffle operations written in PTX
*/
__global__
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
int reducedStrideNum,int blockSize, int blockNum)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadMax = FLOAT_MIN;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
threadMax = max(inputData[it * stride + iOffset], threadMax);
}
__syncthreads();
threadMax = shflDownReduceMax(threadMax);
if ((tid & 0x1f) == 0)
data[tid / 32] = threadMax;
__syncthreads();
/* use one warp to reduce remaining data */
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
        else threadMax = FLOAT_MIN;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
}
}
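/*
Editor-added illustration: shflDownReduceMax is used by the kernels above but defined
elsewhere in the library (its sum counterpart below is written in inline PTX). The sketch
below shows a functionally equivalent warp-level max reduction written with CUDA 9's
__shfl_down_sync intrinsic; the name shflDownReduceMaxSketch and its body are the editor's
illustration of what the primitive is assumed to do, not the library's own implementation.
*/
__device__ __forceinline__
DTYPE shflDownReduceMaxSketch(DTYPE value)
{
    /* fold lanes 16..31 into 0..15, then halve the span each step,
       so lane 0 ends up holding the maximum over the whole warp */
    for (int offset = 16; offset > 0; offset >>= 1)
        value = MAX(value, __shfl_down_sync(0xffffffff, value, offset));
    return value;
}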
/* /*
get the max-valued items along a dimension of the tensor (cuda version). get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a, For a 1-dimensional data array a,
...@@ -336,20 +499,18 @@ sum_i = max_{0<=j<strideNum} input_{i,j} ...@@ -336,20 +499,18 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
*/ */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim) void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{ {
    CheckNTErrors(input && output, "Empty input or output tensors!");
    CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
    CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");

    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
        }
        else if(i > dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
        }
    }
...@@ -382,130 +543,149 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim) ...@@ -382,130 +543,149 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup; int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup); ProtectCudaDev(input->devID, devIDBackup);
    if (stride == 1 && blockNum >= 10) {
        dim3 grids;
        dim3 blocks;
        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
        if (blocks.y >= 128) {
            KernelReduceMaxOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
        }
        else {
            if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
            else blockNum = blockNum / 4;
            KernelReduceMaxOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
        }
    }
    else {
        do {
            if (input->dataType == DEFAULT_DTYPE) {
                DTYPE * iData = NULL;
                DTYPE * oData = NULL;
                if (iter == 0) {
                    iData = (DTYPE*)input->data;
                    oData = buf1;
                }
                else if (iter % 2 == 1) {
                    iData = buf1;
                    oData = buf2;
                }
                else {
                    iData = buf2;
                    oData = buf1;
                }

                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum < 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
            }
            else if (input->dataType == X_FLOAT16) {
                __half * buf1ft16 = (__half *)buf1;
                __half * buf2ft16 = (__half *)buf2;
                __half * iData = NULL;
                __half * oData = NULL;
                if (iter == 0) {
                    iData = (__half*)input->data;
                    oData = buf1ft16;
                }
                else if (iter % 2 == 1) {
                    iData = buf1ft16;
                    oData = buf2ft16;
                }
                else {
                    iData = buf2ft16;
                    oData = buf1ft16;
                }

                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum < 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
                }
            }

            strideNum = cudaGridSize[0];
            blockSize = cudaGridSize[0];
            iter++;
        } while (strideNum > 1);
    }
BacktoCudaDev(input->devID, devIDBackup); BacktoCudaDev(input->devID, devIDBackup);
......
...@@ -27,6 +27,57 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -27,6 +27,57 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/*
use PTX shuffle code to reduce float data within a warp
*/
__device__ __forceinline__
float shflDownReduceSum(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.f32 %0, r0, %1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX shuffle code to reduce int data within a warp
*/
__device__ __forceinline__
int shflDownReduceSum(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.s32 %0, r0, %1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
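/*
Editor-added note: the two PTX routines above implement a standard warp-level tree reduction.
Each shfl.down.b32 pulls the value held by the lane that sits "offset" positions higher and the
add folds it in, with offsets 16, 8, 4, 2, 1, so lane 0 ends up with the sum over all 32 lanes.
The sketch below expresses the same reduction with CUDA 9's __shfl_down_sync intrinsic; it is an
editor's illustration for comparison, not the implementation used by this file.
*/
__device__ __forceinline__
float shflDownReduceSumSketch(float input)
{
    /* halve the active span each step: 16, 8, 4, 2, 1 */
    for (int offset = 16; offset > 0; offset >>= 1)
        input += __shfl_down_sync(0xffffffff, input, offset);
    return input;
}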
/* /*
reduce a tensor to another that keeps the sum along a dimension - slow version reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have Given a block of data, we go over each dimension i in the stride and we have
...@@ -157,6 +208,7 @@ void KernelReduceSum(__half * input, __half * output, ...@@ -157,6 +208,7 @@ void KernelReduceSum(__half * input, __half * output,
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half value = isValid ? __hsub(input[blockSize * k + stride * j + iOffset], bias[threadIdx.x]) : __half(0); __half value = isValid ? __hsub(input[blockSize * k + stride * j + iOffset], bias[threadIdx.x]) : __half(0);
DTYPE power2 = __half2float(power); DTYPE power2 = __half2float(power);
if(power2 != (DTYPE)1.0){ if(power2 != (DTYPE)1.0){
if(power2 == (DTYPE)2.0) if(power2 == (DTYPE)2.0)
value = __hmul(value, value); value = __hmul(value, value);
...@@ -276,25 +328,19 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -276,25 +328,19 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
value2 = exp(value2); value2 = exp(value2);
} }
    value = value + value2;
    __syncthreads();

    value = shflDownReduceSum(value);

    /* load the per-warp results into the shared mem */
    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
    __syncthreads();

    if (tid < 32){
        if (tid < blockDim.y / 32)
            value = data[tid];
        else value = 0;
        value = shflDownReduceSum(value);

        /* write result for this block to the output array */
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
    }
}
/* /*
...@@ -430,6 +476,195 @@ void KernelReduceSumFast(__half * input, __half * output, ...@@ -430,6 +476,195 @@ void KernelReduceSumFast(__half * input, __half * output,
#endif #endif
} }
/*
if the data storage is discontinuous, use this kernel to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int blockIndex = idx / stride;
int offsetInBlock = idx % stride;
if (idx >= stride * blockNum)
return;
bias[idx % blockDim.x] = shift != NULL ? shift[idx] : 0;
DTYPE ans = 0;
#pragma unroll
for (int i = stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
i += stride){
DTYPE value = input[i];
value = value - bias[idx % blockDim.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) {
value = exp(value);
}
ans += value;
}
output[idx] = ans;
}
__global__
void KernelReduceSumOp(DTYPE * input, DTYPE * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
if (threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
__syncthreads();
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadSum = 0;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE value = inputData[it * stride + iOffset] - bias[threadIdx.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
__syncthreads();
threadSum = shflDownReduceSum(threadSum);
if ((tid & 0x1f) == 0) { data[tid / 32] = threadSum; }
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
}
}
__global__
void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
int strideNum, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
if (idx == 0)
bias[threadIdx.x / 32] = shift != NULL ? shift[idy] : 0;
int startIndex = idy * strideNum;
DTYPE threadSum = 0;
for (int i = idx; i < strideNum; i += 32) {
DTYPE value = input[startIndex + i] - bias[threadIdx.x / 32];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
threadSum = shflDownReduceSum(threadSum);
if (idx == 0)
output[idy] = threadSum;
}
/*
allocate the number of warps per vector according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8) {
warpNum = 8;
if (vectorNum < 20 * 4) {
warpNum = 16;
            if (vectorNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
/*
in this situation each of the block.x * grid.x threads handles one reduced vector, and neighbouring threads read neighbouring addresses, so global memory reads are contiguous
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
{
block.x = 512;
block.y = 1;
if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512;
else
grid.x = (stride * blockNum) / 512 + 1;
grid.y = 1;
}
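/*
Editor-added usage sketch with hypothetical sizes: this helper lays out one thread per output
element in 512-thread 1D blocks. With stride = 300 and blockNum = 7 there are 2100 output
elements, so it returns grid = (5, 1, 1) and block = (512, 1, 1); the 460 surplus threads are
filtered out by the "idx >= stride * blockNum" guard in KernelReduceSumDiscontinuousStorage.
*/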
/*
fold threads.x into blocks.x and pad threads.y to a full warp so that the warp-shuffle optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
/* /*
sum the items along a dimension of the tensor (cuda version). sum the items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a, For a 1-dimensional data array a,
...@@ -444,27 +679,24 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true ...@@ -444,27 +679,24 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/ */
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp) void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{ {
    CheckNTErrors(input && output, "Empty input or output tensors!");
    CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
    CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
    CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");

    int dimRDI = input->order - dim - 1;
    for(int i = 0; i < input->order; i++){
        if(i < dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
        }
        else if(i > dimRDI){
            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
        }
    }

    if(input->dataType == X_FLOAT16)
        CheckNTErrors(power == 0 || power == 0.5 || power == 1.0 || power == 2.0, "TODO!");
int cudaGridSize[3]; int cudaGridSize[3];
int cudaBlockSize[3]; int cudaBlockSize[3];
...@@ -496,136 +728,172 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -496,136 +728,172 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup; int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup); ProtectCudaDev(input->devID, devIDBackup);
    if (stride == 1 && blockNum >= 10) {
        dim3 grids;
        dim3 blocks;
        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
        if (blocks.y >= 128)
            KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
        else {
            if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
            else blockNum = blockNum / 4;
            KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
        }
    }
    else if (stride != 1 && stride * blockNum > 4096){
        //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
        //unsigned int* goutput = (unsigned int *)input->data;
        //convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
        dim3 grid, block;
        discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
        KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
                                                               blockNum, strideNum, sp, power, isExp);
    }
    else {
        do {
            if (input->dataType == DEFAULT_DTYPE) {
                DTYPE * iData = NULL;
                DTYPE * oData = NULL;
                if (iter == 0) {
                    iData = (DTYPE*)input->data;
                    oData = buf1;
                }
                else if (iter % 2 == 1) {
                    iData = buf1;
                    oData = buf2;
                }
                else {
                    iData = buf2;
                    oData = buf1;
                }

                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum <= 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                           blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                   blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, sp, power, isExp);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (DTYPE*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
                    adjustThreadForUseWarpOptimization(blocks, threads);
                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, sp, power, isExp);
                }
            }
            else if (input->dataType == X_FLOAT16) {
                __half * buf1ft16 = (__half *)buf1;
                __half * buf2ft16 = (__half *)buf2;
                __half * spft16 = (__half *)sp;
                unsigned short power2 = FloatToFloat16(power);
                __half * powerft16p = (__half*)&power2;
                __half * iData = NULL;
                __half * oData = NULL;
                if (iter == 0) {
                    iData = (__half*)input->data;
                    oData = buf1ft16;
                }
                else if (iter % 2 == 1) {
                    iData = buf1ft16;
                    oData = buf2ft16;
                }
                else {
                    iData = buf2ft16;
                    oData = buf1ft16;
                }

                /* unroll the reduction procedure. The code is messy but it is faster. */
                if (strideNum < 32) {
                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                           blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 128) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                   blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 256) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else if (strideNum < 512) {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
                else {
                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
                    if (cudaGridSize[0] == 1)
                        oData = (__half*)output->data;
                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
                    KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
                                                                    blockSize, blockNum, spft16, *powerft16p, isExp);
                }
            }

            strideNum = cudaGridSize[0];
            blockSize = cudaGridSize[0];
            sp = NULL;
            power = (DTYPE)1.0;
            isExp = false;
            iter++;
        } while (strideNum > 1);
    }
if (mem != NULL) if (mem != NULL)
......
...@@ -176,15 +176,19 @@ make a new tensor to keep the result and return it ...@@ -176,15 +176,19 @@ make a new tensor to keep the result and return it
*/ */
XTensor LogSoftmax(const XTensor &x, int leadDim) XTensor LogSoftmax(const XTensor &x, int leadDim)
{ {
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x); XTensor y(&x);
y.SetTMP(); y.SetTMP();
/* call _LogSoftmax function */ /* call _LogSoftmax function */
    _LogSoftmax(&x, &y, ld);
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX); XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
    XLink::AddParamToHeadInt(&y, ld);
return y; return y;
} }
...@@ -248,7 +252,7 @@ There are two ways to implement this process. ...@@ -248,7 +252,7 @@ There are two ways to implement this process.
Method 1. we compute dE/dy and dy/dx respectively, and then reach dE/dx by dE/dx = dE/dy * dy/dx
(or more precisely dE/dx_j = \sum_{i} {dE/dy_i * dy_i/dx_j})
Method 2. we compute dE/dx (or dE/dx_j) in a single step, rather than resorting to the
sub-models of dE/dy and dy/dx. We can do this by using dE/dx_j = -gold_j + exp(y_j)

Here we choose Method 2, i.e., we straightforwardly compute dE/dx_j by
...@@ -257,12 +261,12 @@ dE/dx_j = -gold_j + exp(y_j) ...@@ -257,12 +261,12 @@ dE/dx_j = -gold_j + exp(y_j)
(or dE/dx_j = -\delta(i,j) + exp(y_j) for a Maximum A Posteriori Estimation (MAP))

Method 1 is also fine but is more time consuming due to the summation over dimensions.
Note that this method is not good for the standard version of softmax when we work with
the cross entropy loss, because it is numerically unstable. When we use the usual definition
of softmax, y_i = e^{x_i} / \sum_{k} e^{x_k}, it is trivial to show that
dy_i/dx_j = y_i * \delta(i,j) - y_i * y_j. As y_i and y_j can be small numbers,
y_i * y_j would result in an even smaller value with a risk of losing precision. This is even
worse when we multiply dy_i/dx_j with dE/dy_i. So it is in general better to use log softmax for
better numerical stability.
......
...@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{ else{
for(int i = 0; i < n; i++){ for(int i = 0; i < n; i++){
DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j]; DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
                if (r > (DTYPE)1.0F)
                    r = (DTYPE)1.0F;
                else if (r < 0)
                    r = 0;
op[i * m + j] = r; op[i * m + j] = r;
} }
} }
...@@ -143,14 +143,19 @@ make a new tensor to keep the result and return it ...@@ -143,14 +143,19 @@ make a new tensor to keep the result and return it
*/ */
XTensor Softmax(const XTensor &x, int leadDim) XTensor Softmax(const XTensor &x, int leadDim)
{ {
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x); XTensor y(&x);
y.SetTMP(); y.SetTMP();
/* call _Softmax function */ /* call _Softmax function */
    _Softmax(&x, &y, ld);
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX); XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
return y; return y;
} }
......
...@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y, ...@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
if(i < strideSizeTotal && j < strideNum){ if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x]; int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
        DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
if (r >(DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
y[offset] = r;
} }
} }
...@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * ...@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
maxData = broadcast(maxData); maxData = broadcast(maxData);
if (i < strideNum){ if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2; int offset = int(j / stride) * blockSize + i * stride + i2;
            DTYPE r = exp(input[offset] - maxData) / sumData;
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
output[offset] = r;
} }
} }
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#include "TDivDim.h"
#include "../core/arithmetic/DivDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2) = (2, 4), n = 0, alpha = 0.0.
*/
bool TestDivDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 0);
_DivDim(cMe, b, 0);
cUser = DivDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sum function */
_DivDim(aGPU, bGPU, cGPU, 0);
_DivDim(cMeGPU, bGPU, 0);
cUserGPU = DivDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2, 2) = (2, 4), n = 1.
*/
bool TestDivDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {0.0F, -1.0F, -2.0F, 3.0F},
{4.0F, -5.0F, -6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 1);
_DivDim(cMe, b, 1);
cUser = DivDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sum function */
_DivDim(aGPU, bGPU, cGPU, 1);
_DivDim(cMeGPU, bGPU, 1);
cUserGPU = DivDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for DivDim Function */
bool TestDivDim()
{
XPRINT(0, stdout, "[TEST DIVDIM] tensor division c(i) = a/b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDivDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDivDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#ifndef __TEST_DIVDIM_H__
#define __TEST_DIVDIM_H__
#include "../core/arithmetic/DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for DivDim Function */
bool TestDivDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 0);
_MultiplyDimMe(cMe, b, 0);
cUser = MultiplyDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 0);
_MultiplyDimMe(cMeGPU, bGPU, 0);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor multiplication c = a*b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
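/* Illustrative note (added, not library text): with n = 1 the broadcast is column-wise,
   c[i][j] = a[i][j] * b[j] + alpha * c[i][j],
   so b = {1, -1, 1, -1} flips the sign of every second column of a,
   matching the "answer" array used below. */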
bool TestMultiplyDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = {1.0F, -1.0F , 1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 1);
_MultiplyDimMe(cMe, b, 1);
cUser = MultiplyDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 1);
_MultiplyDimMe(cMeGPU, bGPU, 1);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
XPRINT(0, stdout, "[TEST MULTIPLYDIM] tensor multiplication c = a * b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMultiplyDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiplyDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#ifndef __TEST_MULTIPLYDIM_H__
#define __TEST_MULTIPLYDIM_H__
#include "../core/arithmetic/MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MultiplyDim Function */
bool TestMultiplyDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MULTIPLYDIM_H__
\ No newline at end of file
...@@ -24,7 +24,8 @@ ...@@ -24,7 +24,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: sum the items along a dimension of the tensor. case 1: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case, In this case,
(2, 4) -> (4), dim = 0 (2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1 (2, 4) -> (2), dim = 1
...@@ -90,8 +91,8 @@ bool TestReduceSum1() ...@@ -90,8 +91,8 @@ bool TestReduceSum1()
tUser2 = ReduceSum(*s, 1, *shift2); tUser2 = ReduceSum(*s, 1, *shift2);
/* check results */ /* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2); t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -120,8 +121,8 @@ bool TestReduceSum1() ...@@ -120,8 +121,8 @@ bool TestReduceSum1()
tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2); tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2);
/* check results */ /* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2); tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */ /* destroy variables */
delete s; delete s;
......
...@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
case 1: tensor summation c = a + b * \beta case 1: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a, where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2) = (2, 4), n = 0.
*/ */
bool TestSumDim1() bool TestSumDim1()
{ {
...@@ -79,9 +80,9 @@ bool TestSumDim1() ...@@ -79,9 +80,9 @@ bool TestSumDim1()
cUser = SumDim(*a, *b, 0); cUser = SumDim(*a, *b, 0);
/* check results */ /* check results */
cpuTest = c->CheckData(answer, aUnitNum) cpuTest = c->CheckData(answer, aUnitNum) &&
&& cMe->CheckData(answer, aUnitNum) cMe->CheckData(answer, aUnitNum) &&
&& cUser.CheckData(answer, aUnitNum); cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -106,9 +107,9 @@ bool TestSumDim1() ...@@ -106,9 +107,9 @@ bool TestSumDim1()
cUserGPU = SumDim(*aGPU, *bGPU, 0); cUserGPU = SumDim(*aGPU, *bGPU, 0);
/* check results */ /* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) gpuTest = cGPU->CheckData(answer, aUnitNum) &&
&& cMeGPU->CheckData(answer, aUnitNum) cMeGPU->CheckData(answer, aUnitNum) &&
&& cUserGPU.CheckData(answer, aUnitNum); cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
...@@ -139,7 +140,8 @@ bool TestSumDim1() ...@@ -139,7 +140,8 @@ bool TestSumDim1()
/* /*
case 2: tensor summation c = a + b * \beta case 2: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a, where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2, 2) = (2, 4), n = 1.
*/ */
bool TestSumDim2() bool TestSumDim2()
{ {
...@@ -192,9 +194,9 @@ bool TestSumDim2() ...@@ -192,9 +194,9 @@ bool TestSumDim2()
cUser = SumDim(*a, *b, 1); cUser = SumDim(*a, *b, 1);
/* check results */ /* check results */
cpuTest = c->CheckData(answer, aUnitNum) cpuTest = c->CheckData(answer, aUnitNum) &&
&& cMe->CheckData(answer, aUnitNum) cMe->CheckData(answer, aUnitNum) &&
&& cUser.CheckData(answer, aUnitNum); cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
...@@ -219,9 +221,9 @@ bool TestSumDim2() ...@@ -219,9 +221,9 @@ bool TestSumDim2()
cUserGPU = SumDim(*aGPU, *bGPU, 1); cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */ /* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) gpuTest = cGPU->CheckData(answer, aUnitNum) &&
&& cMeGPU->CheckData(answer, aUnitNum) cMeGPU->CheckData(answer, aUnitNum) &&
&& cUserGPU.CheckData(answer, aUnitNum); cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */ /* destroy variables */
delete a; delete a;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#include "TTmp.h"
#include "../XTensor.h"
#include "../../xc/ultility.h"
#include "../../xc/myCode.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
bool TestTmp1()
{
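/* Descriptive note (added): this temporary case loads a dumped (8, 32, 106, 106)
   tensor from disk, runs Softmax over its last dimension on the CPU (and, unless
   the early exit below is removed, skips the GPU check), then compares the result
   against a previously dumped reference. The "V:/temp/..." paths are the author's
   local dump files, not part of the library. */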
XTensor a;
XTensor b;
InitTensor4D(&a, 8, 32, 106, 106, X_FLOAT, -1, NULL);
FILE * fReadOrigin = fopen("V:/temp/input.dump", "rb");
a.Read(fReadOrigin, "a-plus-bias");
fclose(fReadOrigin);
b = Softmax(a, 3);
XTensor c;
InitTensor4D(&c, 8, 32, 106, 106, X_FLOAT, -1, NULL);
FILE * fReadResult = fopen("V:/temp/input.dump.result", "rb");
c.Read(fReadResult, "");
fclose(fReadResult);
printf("\n\nThis is CPU!\n");
b.Dump(stderr, "b", 100);
printf("\n\n");
c.Dump(stderr, "c", 100);
bool cpuTest;
cpuTest = b.CheckData(c.data, b.unitNum, 1e-6F);
if(cpuTest == true)
printf("CPU Yeah!");
else
printf("CPU ops..");
exit(1);
#ifdef USE_CUDA
XTensor aGPU;
XTensor bGPU;
InitTensor4D(&aGPU, 8, 32, 106, 106, X_FLOAT, 0, NULL);
InitTensor4D(&bGPU, 8, 32, 106, 106, X_FLOAT, 0, NULL);
fReadOrigin = fopen("V:/temp/input.dump", "rb");
aGPU.Read(fReadOrigin, "a-plus-bias");
fclose(fReadOrigin);
//bGPU = Softmax(aGPU, 3);
_Softmax(&aGPU, &bGPU, 3);
printf("\n\nThis is GPU\n");
bGPU.Dump(stderr, "bGPU", 100);
bool gpuTest;
gpuTest = bGPU.CheckData(c.data, bGPU.unitNum, 1e-4F);
if(gpuTest == true)
printf("GPU Yeah!");
else
printf("GPU ops..");
#endif // USE_CUDA
exit(1);
return false;
}
bool TestTmp2()
{
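/* Descriptive note (added): this temporary case reads a dumped tensor into b and
   compares it against a; the read of the gold data into a is commented out below,
   so a is still uninitialized when CheckTensorData is called. */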
XTensor a;
XTensor b;
InitTensor4D(&a, 8, 32, 106, 106, X_FLOAT, -1, NULL);
InitTensor4D(&b, 8, 32, 106, 106, X_FLOAT, -1, NULL);
//FILE * fReadResultGold = fopen("V:/temp/input.dump.gold", "rb");
//a.Read(fReadResultGold, "input");
//fclose(fReadResultGold);
FILE * fReadResult = fopen("V:/temp/input.dump", "rb");
b.Read(fReadResult, "a-plus-bias");
fclose(fReadResult);
ShowData(&b, "");
bool flag = CheckTensorData(a, b, 1e-3F);
if (flag)
printf("yeah");
else
printf("ops.");
exit(1);
return false;
}
/* other cases */
/*
TODO!!
*/
/* test for Tmp Function */
bool TestTmp()
{
XPRINT(0, stdout, "[TEST Temp] temporary test\n");
bool returnFlag = true, caseFlag = true;
///* case 1 test */
//caseFlag = TestTmp1();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 1 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestTmp2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-13
*/
#ifndef __TEST_TMP_H__
#define __TEST_TMP_H__
#include "../core/CHeader.h"
#include "../function/FHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
bool TestTmp();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_TMP_H__
...@@ -99,8 +99,8 @@ bool TestUnsqueeze1() ...@@ -99,8 +99,8 @@ bool TestUnsqueeze1()
tUser2 = Unsqueeze(*s, 2, 2); tUser2 = Unsqueeze(*s, 2, 2);
/* check results */ /* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2); t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA #ifdef USE_CUDA
/* GPU test */ /* GPU test */
......
...@@ -29,6 +29,8 @@ bool Test() ...@@ -29,6 +29,8 @@ bool Test()
bool wrong = false; bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n"); XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
//wrong = !TestTmp() || wrong;
wrong = !TestAbsolute() || wrong; wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong; wrong = !TestClip() || wrong;
wrong = !TestConcatenate() || wrong; wrong = !TestConcatenate() || wrong;
...@@ -38,6 +40,7 @@ bool Test() ...@@ -38,6 +40,7 @@ bool Test()
wrong = !TestCopyIndexed() || wrong; wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong; wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong; wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong; wrong = !TestExp() || wrong;
wrong = !TestLog() || wrong; wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong; wrong = !TestMatrixMul() || wrong;
...@@ -46,6 +49,7 @@ bool Test() ...@@ -46,6 +49,7 @@ bool Test()
wrong = !TestMatrixMulBatched() || wrong; wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMerge() || wrong; wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong; wrong = !TestMultiply() || wrong;
wrong = !TestMultiplyDim() || wrong;
wrong = !TestNegate() || wrong; wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong; wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong; wrong = !TestPower() || wrong;
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#ifndef __TEST_H__ #ifndef __TEST_H__
#define __TEST_H__ #define __TEST_H__
#include "TTmp.h"
#include "TAbsolute.h" #include "TAbsolute.h"
#include "TClip.h" #include "TClip.h"
#include "TConcatenate.h" #include "TConcatenate.h"
...@@ -31,6 +33,7 @@ ...@@ -31,6 +33,7 @@
#include "TCopyIndexed.h" #include "TCopyIndexed.h"
#include "TCopyValues.h" #include "TCopyValues.h"
#include "TDiv.h" #include "TDiv.h"
#include "TDivDim.h"
#include "TExp.h" #include "TExp.h"
#include "TLog.h" #include "TLog.h"
#include "TMatrixMul.h" #include "TMatrixMul.h"
...@@ -39,6 +42,7 @@ ...@@ -39,6 +42,7 @@
#include "TMatrixMulBatched.h" #include "TMatrixMulBatched.h"
#include "TMerge.h" #include "TMerge.h"
#include "TMultiply.h" #include "TMultiply.h"
#include "TMultiplyDim.h"
#include "TNegate.h" #include "TNegate.h"
#include "TNormalize.h" #include "TNormalize.h"
#include "TPower.h" #include "TPower.h"
......