Commit e223c59c by xuchen

bug fixed and add some function

parent 8e13830b
......@@ -2,7 +2,7 @@
## 注意事项
CUDA最新版本9.2尚且不支持VS2017最新版本,因此建议使用CUDA版本为9.0或9.1,建议使用VS版本为VS2015,或使用VS2017时安装v140工具集。
CUDA最新版本9.2尚且不支持VS2017最新版本,因此建议使用CUDA版本为9.0或9.1,建议使用VS版本为VS2015,或使用VS2017时安装v140工具集,解决方案平台设置为×64
## CUDA配置
......@@ -29,7 +29,7 @@ CUDA最新版本9.2尚且不支持VS2017最新版本,因此建议使用CUDA版
**C/C++->预处理器->预处理器定义** 中,添加
>USE_CUDA;USE_BLAS;WIN32;MKL;DEBUG;CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
CONSOLE;
**链接器->系统->子系统**,设置为控制台。
......
......@@ -24,6 +24,7 @@
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
......@@ -31,18 +32,24 @@
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv )
{
//_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......@@ -58,6 +65,41 @@ int main( int argc, const char ** argv )
return 0;
}
void BackwardTest()
{
XNet net;
XTensor a;
XTensor b;
XTensor c;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);
a.SetZeroAll();
b.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 0, 1);
a.Set2D(3.0F, 0, 2);
a.Set2D(4.0F, 1, 0);
a.Set2D(5.0F, 1, 1);
a.Set2D(6.0F, 1, 2);
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Dump(stderr);
}
void TransposeTest()
{
#ifdef USE_CUDA
......
......@@ -50,6 +50,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
......@@ -58,6 +59,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else{
......
......@@ -22,7 +22,11 @@
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
#include "../tensor/function/LogSoftmax.h"
#include "../tensor/function/Rectify.h"
#include "../tensor/function/Sigmoid.h"
#include "../tensor/function/Softmax.h"
namespace nts{
......@@ -49,10 +53,22 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
......
......@@ -41,6 +41,8 @@ void XMathGrad::MakeGrad(XTensor * node)
GradSumDim(node);
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
else if (operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node);
else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node);
else if(operID == MATH_MATRIXMULBATCHED)
......@@ -55,6 +57,8 @@ void XMathGrad::MakeGrad(XTensor * node)
GradScaleAndShift(node);
else if (operID == MATH_DIV)
GradDiv(node);
else if (operID == MATH_DIVDIM)
GradDivDim(node);
else if (operID == MATH_SUB)
GradSub(node);
else if (operID == MATH_SIN)
......@@ -242,6 +246,94 @@ void XMathGrad::GradMultiply(XTensor * node)
node->visitMark = NODE_FINISHED;
}
/*
gradient for multiply with one dimension
c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
}
DelTensorBuf(interGrad);
}
DelTensor(bGradTMP);
node->visitMark = NODE_FINISHED;
}
/*
gradient for matrix multiply
for c = matmul(a, b) * \alpha
......@@ -381,7 +473,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node)
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
......@@ -460,7 +551,7 @@ gradient for power
for
c = pow(a,p)
we have
dE/da = (dE/dc) * p*a^(p-1)
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradPower(XTensor * node)
......@@ -469,21 +560,19 @@ void XMathGrad::GradPower(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
_Power(a, b, (p-1)/p);
_ScaleAndShift(b, c, p);
_Multiply(node->grad, c, a->grad, 1.0F);
_Power(a, b, p - 1.0F);
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensor(b);
node->visitMark = NODE_FINISHED;
delete b;
delete c;
}
/*
......@@ -500,16 +589,16 @@ void XMathGrad::GradNegate(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -526,18 +615,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, scale);
_Sum(a->grad, b, a->grad);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......@@ -583,9 +668,7 @@ void XMathGrad::GradDiv(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(b);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(b);
XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
......@@ -593,16 +676,115 @@ void XMathGrad::GradDiv(XTensor * node)
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
_Power(b, c, -2.0F);
_Multiply(a, c, d);
_ScaleAndShift(d, e, -1.0F);
_Multiply(node->grad, e, b->grad, 1.0F);
_Power(b, ab2, -2.0F);
_Multiply(a, ab2, ab2);
_ScaleAndShiftMe(ab2, -1.0F);
_Multiply(node->grad, ab2, b->grad, 1.0F);
DelTensorBuf(ab2);
node->visitMark = NODE_FINISHED;
}
/*
gradient for division with one dimension
c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * (1/b)
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...)
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradDivDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVDIM!");
delete c;
delete d;
delete e;
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da = dE/dc * (1/b) */
_DivDim(node->grad, b, a->grad, n, 1.0);
/* dE/db = dE/dc * dc/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * aTMP1 = NewTensorBuf(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBuf(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBuf(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
}
DelTensorBuf(interGrad);
}
DelTensorBuf(aTMP1);
DelTensorBuf(aTMP2);
DelTensorBuf(bTMP);
DelTensorBuf(interGradTMP);
node->visitMark = NODE_FINISHED;
}
/*
......@@ -619,16 +801,16 @@ void XMathGrad::GradExp(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -645,16 +827,16 @@ void XMathGrad::GradSin(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -671,19 +853,17 @@ void XMathGrad::GradCos(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sin(a, b);
_ScaleAndShift(b, c, -1.0F);
_Multiply(node->grad, c, a->grad, 1.0F);
_ScaleAndShiftMe(b, -1.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
delete c;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -700,19 +880,17 @@ void XMathGrad::GradTan(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Power(b, c, -2.0F);
_Multiply(node->grad, c, a->grad, 1.0F);
_PowerMe(b, -2.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
delete c;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -818,16 +996,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -843,17 +1021,10 @@ void XMathGrad::GradSign(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XNoder::MakeGrad(a);
b->SetZeroAll();
_Sum(a->grad, b, a->grad);
// we do nothing here
// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......@@ -869,17 +1040,10 @@ void XMathGrad::GradRound(XTensor * node)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XNoder::MakeGrad(a);
b->SetZeroAll();
_Sum(a->grad, b, a->grad);
// we do nothing here
// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
delete b;
}
/*
......@@ -895,7 +1059,7 @@ void XMathGrad::GradClip(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
......@@ -905,9 +1069,9 @@ void XMathGrad::GradClip(XTensor * node)
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -924,21 +1088,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_ScaleAndShift(b, c, 1.0F/n);
_Sum(a->grad, c, a->grad);
_ScaleAndShiftMe(b, 1.0F/n);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
delete c;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -955,27 +1118,28 @@ void XMathGrad::GradReduceSum(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensor(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
gradient for reduceSumSquared
for
c = reduceSumSquared(a, dim, b)
c = \sum_i (a_i - b)^2
we have
dE/da = Unsqueeze(dE/dc) * 2a
dE/db = Unsqueeze(dE/dc) * (-2b)
dE/db = dE/dc * -2 * n * b
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceSumSquared(XTensor * node)
......@@ -985,35 +1149,46 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(a);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(c);
XTensor * c = NewTensorBuf(a, a->devID, a->mem);
XTensor * d = NewTensorBuf(a, a->devID, a->mem);
XTensor * e = NewTensorBuf(a, a->devID, a->mem);
XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_ScaleAndShift(a, c, 2.0F);
_ScaleAndShift(b, d, -2.0F);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2(a-b) */
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
_Multiply(d, e, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
/* dE/db = dE/dc * -2 * (a-b*n) */
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
delete c;
delete d;
delete e;
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
node->visitMark = NODE_FINISHED;
}
/*
gradient for reduceVariance
for
c = reduceVariance(a, dim, b)
c = (sum_i (a_i - b)^2) * 1/n
where b is the mean, and n is the size of a
we have
dE/da = Unsqueeze(dE/dc) * 2a/dimSizeA[dim]
dE/db = Unsqueeze(dE/dc) * (-2a/dimSizeA[dim])
dE/da = Unsqueeze(dE/dc) * 2a/n
dE/db = dE/dc * -2 * b
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceVariance(XTensor * node)
......@@ -1023,26 +1198,36 @@ void XMathGrad::GradReduceVariance(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(a);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(a);
XTensor * c = NewTensorBuf(a, a->devID, a->mem);
XTensor * d = NewTensorBuf(a, a->devID, a->mem);
XTensor * e = NewTensorBuf(a, a->devID, a->mem);
XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_ScaleAndShift(a, c, 2.0F / n);
_ScaleAndShift(b, d, -2.0F / n);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2 (a-b) / n */
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
_Multiply(d, e, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
/* dE/db = dE/dc * -2 * (a-b) */
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
delete c;
delete d;
delete e;
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
node->visitMark = NODE_FINISHED;
}
}
......@@ -53,6 +53,11 @@ private:
static
void GradMultiply(XTensor * node);
/* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * node);
......@@ -92,6 +97,10 @@ private:
static
void GradDiv(XTensor * node);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node);
......
......@@ -137,6 +137,8 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
//XNoder::MakeGrad(root);
//lossGrad.Compute(gold, root, x, root->grad, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
/* we compuate dE/dy (y is the output) if no predefined activation function is used */
......
......@@ -99,7 +99,7 @@ arguments:
(how many words)
-shuffle: shuffle the training data
-devid D: the id of the device used
-1: GPU, >=0: GPUs
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
......
......@@ -57,18 +57,22 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.08F);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
wk.SetDataRand(-minmax, minmax);
wq.SetDataRand(-minmax, minmax);
wv.SetDataRand(-minmax, minmax);
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
}
/*
......@@ -104,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
att = BMMul(scalar, vheads);
/* concatenate the heads */
......
......@@ -53,16 +53,14 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
w.SetDataRandn(0, sqrt((float)eSize));
w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
......@@ -84,11 +82,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d));
dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
}
else{
int i = (k - 1)/2;
dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d));
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
}
......@@ -135,10 +133,10 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = MMul(&input, w);
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
return wordEmbedding +posEmbedding;
}
}
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_BEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
......@@ -53,6 +53,9 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* word embedding matrix */
XTensor w;
......
......@@ -38,7 +38,8 @@ AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] fnns;
delete[] layerNorms;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
}
/*
......@@ -53,13 +54,12 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
devID = myDevID;
mem = myMem;
LoadParamInt(argc, argv, "nstack", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, 512);
LoadParamInt(argc, argv, "esize", &eSize, 512);
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
CheckNTErrors(nlayer > 1, "We have one encoding layer at least!");
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
......@@ -67,13 +67,15 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
layerNorms = new T2TLN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
layerNorms[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
......@@ -103,10 +105,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
ln = layerNorms[i].Make(res);
/* input of next layer */
x = ln;
x = attLayerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x);
......@@ -117,10 +116,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
ln = layerNorms[i].Make(res);
/* input of next layer */
x = ln;
x = fnnLayerNorms[i].Make(res);
}
return x;
......
......@@ -86,8 +86,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization */
T2TLN * layerNorms;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -55,10 +56,10 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.08F);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -66,10 +67,14 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
w1.SetDataRand(-minmax, minmax);
b1.SetDataRand(-minmax, minmax);
w2.SetDataRand(-minmax, minmax);
b2.SetDataRand(-minmax, minmax);
float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll();
w2.SetDataRand(-finfout2, finfout2);
b2.SetZeroAll();
}
/*
......@@ -83,10 +88,10 @@ XTensor T2TFNN::Make(XTensor &input)
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, X_NOTRANS, w1, X_NOTRANS) + b1);
t1 = Rectify(MMul(input, w1) + b1);
/* result = t1 * w2 + b2 */
return MMul(t1, X_NOTRANS, w2, X_NOTRANS) + b2;
return MMul(t1, w2) + b2;
}
......
......@@ -20,6 +20,8 @@
*/
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -48,6 +50,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / (d + d));
w.SetDataRand(-finfout, finfout);
b.SetZeroAll();
}
/*
......@@ -60,6 +74,7 @@ y =
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
......@@ -67,7 +82,7 @@ XTensor T2TLN::Make(XTensor &input)
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceSum(x, x.order - 1);
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
......@@ -76,12 +91,15 @@ XTensor T2TLN::Make(XTensor &input)
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same size of x */
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
return (x - meanFilled)/standardFilled;
xn = (x - meanFilled)/standardFilled ;
/* result = x' * w + b */
return MMul(xn, w) + b;
}
}
......@@ -29,6 +29,8 @@ using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
......@@ -37,6 +39,12 @@ public:
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
public:
/* constructor */
......
......@@ -22,6 +22,7 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
......@@ -81,9 +82,9 @@ make the entire network (with the output softmax layer)
*/
void T2TModel::Make(XTensor &input, XTensor &output)
{
XTensor encoding;
if(isLM){
XTensor encoding;
encoding = MakeEncoding(input);
outputLayer.Make(encoding, output);
}
......@@ -92,4 +93,4 @@ void T2TModel::Make(XTensor &input, XTensor &output)
}
}
}
\ No newline at end of file
}
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -56,12 +57,15 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
w.SetDataRand(-minmax, minmax);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
}
/*
......@@ -89,4 +93,4 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
output = LogSoftmax(MMul(x, w), -1);
}
}
\ No newline at end of file
}
......@@ -59,6 +59,8 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
LoadParamInt(argc, argv, "nstep", &nstep, 1);
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
......@@ -82,6 +84,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
int wordCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
XNet net;
......@@ -108,8 +111,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* back-propagation for obtaining gradients */
net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
//lr = 0.00005F;
/* update the parameters */
Update(model);
Update(model, lr);
/* get probabilities */
float prob = GetProb(&output, &batch, NULL);
......@@ -125,18 +132,21 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
fclose(file);
if (isEnd)
break;
}
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
}
......@@ -317,10 +327,14 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
}
/*
update the model by delta rule
update the model by delta rule
\theta_new = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
>> model - the t2t model
>> lr - learning rate
*/
void T2TTrainer::Update(T2TModel * model)
void T2TTrainer::Update(T2TModel * model, const float lr)
{
XList ws(100);
......@@ -331,6 +345,13 @@ void T2TTrainer::Update(T2TModel * model)
ws.Add(&model->encoder.fnns[i].b1);
ws.Add(&model->encoder.fnns[i].w2);
ws.Add(&model->encoder.fnns[i].b2);
ws.Add(&model->encoder.attentions[i].wk);
ws.Add(&model->encoder.attentions[i].wq);
ws.Add(&model->encoder.attentions[i].wv);
ws.Add(&model->encoder.fnnLayerNorms[i].w);
ws.Add(&model->encoder.fnnLayerNorms[i].b);
ws.Add(&model->encoder.attLayerNorms[i].w);
ws.Add(&model->encoder.attLayerNorms[i].b);
}
ws.Add(&model->encoder.embedder.w);
......@@ -339,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model)
XTensor * para = (XTensor*)ws.Get(i);
XTensor * paraGrad = para->grad;
if (para == NULL || paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
/*
DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
for (int i = 0; i < para->unitNum; i++) {
if (IsNAN(d[i]) || IsINF(d[i])) {
int nnn = 0;
}
if (IsNAN(g[i]) || IsINF(g[i])) {
int nnn = 0;
}
}
delete[] d;
delete[] g;
*/
/* the delta rule */
_Sum(para, paraGrad, para, -lrate);
_Sum(para, paraGrad, para, -lr);
/* clear gradient */
paraGrad->SetZeroAll();
}
}
......
......@@ -63,6 +63,12 @@ public:
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */
int vSize;
......@@ -105,7 +111,7 @@ public:
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model);
void Update(T2TModel * model, const float lr);
};
......
......@@ -26,6 +26,8 @@
namespace transformer
{
FILE * tmpFILE;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
......
......@@ -27,6 +27,8 @@
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
......
......@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
......@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv)
delete[] trainFN;
fclose(tmpFILE);
return 0;
}
......
......@@ -45,7 +45,7 @@ int main( int argc, const char ** argv )
//_CrtSetBreakAlloc(123);
/* a tiny test */
SmallTest();
//SmallTest();
//_CrtDumpMemoryLeaks();
//return 0;
......
......@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_ inline
#define _XINLINE_
//#define DOUBELPRICSION
......
......@@ -45,12 +45,16 @@ const char * GetOPName(int type)
return "M_CLIP";
else if (type == MATH_DIV)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_NORMALIZE)
......@@ -61,10 +65,12 @@ const char * GetOPName(int type)
return "M_SCALEANDSHIFT";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUB)
return "M_SUB";
else if (type == MATH_SUBDIM)
return "M_SUBDIM";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUMDIM)
return "M_SUMDIM";
else if (type == REDUCE_REDUCEMAX)
......
......@@ -41,17 +41,20 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_MATRIXMUL MATH_DIV + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_SUB MATH_SUM + 1
#define MATH_SUMDIM MATH_SUB + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE_REDUCEMAX REDUCE + 1
......
......@@ -1046,7 +1046,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 1 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d1};
int dims[3] = {d0, d1, d2};
return SetToDevice(devID, GetCell(dims, 3), value);
}
......
......@@ -27,15 +27,18 @@
#include "../XTensor.h"
#include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
......
......@@ -31,7 +31,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (do it on site)
......@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return a XTensor structure)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in addition!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaDivDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++){
if(alpha == 0.0F)
cp[k] = ap[k] / bv;
else
cp[k] = ap[k] / bv + alpha * cp[k];
}
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor division of two tensors (do it on site)
keep the result in the input tensor and return nothing
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_DivDim(a, b, a, n, alpha);
}
/*
tensor division of two tensors (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor division
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "DivDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division of a tensor and a row vector
c = a / b + alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
/*
tensor division of a tensor and a colum vector
c = a / b + alpha * c
where a is a tensor and b is a colum vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockNum - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrics
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a / b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_CUH__
#define __DIVDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting (cuda version)
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_H__
#define __DIVDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division of two tensors:
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we keep the result in the input tensor a and return nothing
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
......@@ -31,7 +31,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (do it on site)
......@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return a XTensor structure)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaMultiplyDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if(a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++)
if(alpha == 0.0F)
cp[k] = ap[k] * bv;
else
cp[k] = ap[k] * bv + alpha * cp[k];
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor multiplication(do it on site)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_MultiplyDim(a, b, a, n, alpha);
}
/*
tensor multiplication (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "MultiplyDim.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor multiplication of a tensor and a row vector
c = a * b + \alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.x];
}
/*
tensor multiplication of a tensor and a colum vector
c = a * b + \alpha * c
where a is a tensor and b is a colum vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockNum - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrics
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.y];
}
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelMultiplyWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_CUH__
#define __MULTIPLYDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor summation a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting (cuda version) */
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_H__
#define __MULTIPLYDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0.0);
/* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__
......@@ -24,6 +24,7 @@
#include "../../XUtility.h"
#include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -123,7 +124,35 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
{
_Sub(a, b, a, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < a.order; i++){
if(a.dimSize[i] == b.unitNum){
hitDim = i;
hitCount++;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/*
tensor subtraction c = a - b * \beta (return a XTensor structure)
make a new tensor c to keep the result and return it
......@@ -138,12 +167,29 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor c(&a);
c.SetTMP();
/* call _Sub function */
_Sub(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
int n = GetSubDimIndex(a, b);
if(n == -1){
/* call _Sub function */
_Sub(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
else if(n >= 0 && n < a.order){
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (XTensor::IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a colum vector
c = a - b * \beta
where a is a tensor and b is a colum vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockNum - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrics
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
......@@ -131,23 +131,43 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumDimIndex(const XTensor &a, const XTensor &b)
{
//if(a.order < b.order)
// return -1;
//int hitCount = 0;
//int hitDim = -1;
//for(int i = 0; i < b.order; i++){
// if(b.dimSize[b.order - 1 - i] == 1)
// continue;
// else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
// hitCount++;
// hitDim = a.order - b.order + i;
// }
//}
//if(hitCount == 1)
// return hitDim;
//else
// return -1;
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
for(int i = 0; i < a.order; i++){
if(a.dimSize[i] == b.unitNum){
hitDim = i;
hitCount++;
hitDim = a.order - b.order + i;
}
}
......@@ -182,7 +202,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
XLink::AddParamToHead(&c, beta);
}
else if(n >= 0 && n < a.order){
/* call _Sum function */
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
......
......@@ -118,6 +118,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
......
......@@ -70,8 +70,8 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE std = gain * sqrt(2.0/(fanIn + fanOut));
DTYPE a = sqrt(3.0) * std;
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
}
......
......@@ -213,7 +213,7 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
......
......@@ -49,7 +49,7 @@ void _SetDataFixedDouble(XTensor * tensor, double p);
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -60,8 +60,12 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p)
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)pow(aData[i], p);
for (int i = 0; i < a->unitNum; i++) {
if (p < 0 && aData[i] == 0)
bData[i] = 1e20F;
else
bData[i] = (DTYPE)pow(aData[i], p);
}
}
}
......
......@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = pow(a[i], p);
if (i < size) {
DTYPE v = a[i];
if (p < 0 && v == 0)
b[i] = 1e20;
else
b[i] = pow(a[i], p);
}
}
/*
......@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size)
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
if (i < size) {
float v = __half2float(a[i]);
if (__half2float(p) < 0 && v == 0)
b[i] = __float2half(1e20);
else
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
}
#endif
}
......
......@@ -29,6 +29,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use PTX code to reduce float data
*/
__device__ __forceinline__
float shflDownReduceMax(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.f32 p, %1, r0; "
"@p mov.f32 %1,r0;"
"mov.f32 %0,%1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX code to reduce int data
*/
__device__ __forceinline__
int shflDownReduceMax(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
......@@ -63,7 +128,7 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
DTYPE value = (i < stride * blockNum && j < strideNum) ?
input[blockSize * k + stride * j + iOffset]: FLOAT_MIN;
input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
......@@ -186,30 +251,26 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
int k = i / stride;
int iOffset = i % stride;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset]: FLOAT_MIN;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
/* load data into the shared mem */
data[tid] = MAX(value, value2);
value = MAX(value, value2);
value = shflDownReduceMax(value);
if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads();
/* unroll the warp */
if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();}
if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();}
if(goodSize >= 128) {if(tid < 64) {if(data[tid] < data[tid + 64]) data[tid] = data[tid + 64];} __syncthreads();}
if(goodSize >= 64) {if(tid < 32) {if(data[tid] < data[tid + 32]) data[tid] = data[tid + 32];} __syncthreads();}
if(goodSize >= 32) {if(tid < 16) {if(data[tid] < data[tid + 16]) data[tid] = data[tid + 16];} __syncthreads();}
if(goodSize >= 16) {if(tid < 8) {if(data[tid] < data[tid + 8]) data[tid] = data[tid + 8];} __syncthreads();}
if(goodSize >= 8) {if(tid < 4) {if(data[tid] < data[tid + 4]) data[tid] = data[tid + 4];} __syncthreads();}
if(goodSize >= 4) {if(tid < 2) {if(data[tid] < data[tid + 2]) data[tid] = data[tid + 2];} __syncthreads();}
if(goodSize >= 2) {if(tid < 1) {if(data[tid] < data[tid + 1]) data[tid] = data[tid + 1];} __syncthreads();}
/* write result for this block to the output array */
if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
if (tid < 32) {
if (tid < blockDim.y / 32)
value = data[tid];
else
value = FLOAT_MIN;
value = shflDownReduceMax(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
}
/*
......@@ -271,9 +332,9 @@ void KernelReduceMaxFast(__half * input, __half * output,
if (goodSize >= 64) { if (tid < 32) { if (data[tid] < data[tid + 32]) data[tid] = data[tid + 32]; } __syncthreads(); }
if (goodSize >= 32) { if (tid < 16) { if (data[tid] < data[tid + 16]) data[tid] = data[tid + 16]; } __syncthreads(); }
if (goodSize >= 16) { if (tid < 8) { if (data[tid] < data[tid + 8]) data[tid] = data[tid + 8]; } __syncthreads(); }
if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); }
if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); }
if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); }
if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); }
if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); }
if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); }
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
/* write result for this block to the output array */
......@@ -291,7 +352,7 @@ reduce a tensor to another that keeps the max value along a dimension - simple
*/
__global__
void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
int stride, int strideNum, int blockSize, int blockNum)
int stride, int strideNum, int blockSize, int blockNum)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -326,6 +387,108 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
op[offset] = max;
}
/*
according the GPU's sm number allocation warp num
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8){
warpNum = 8;
if (vectorNum < 20 * 4){
warpNum = 16;
if (warpNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
/*
adjust threads.x number then we can use warp optimization
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1) {
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
/*
In some case,we use less block to imporve efficiency
*/
__global__
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
{
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int startIndex = idy * strideNum;
DTYPE threadMax = FLOAT_MIN;
for (int i = idx; i < strideNum; i += 32) {
threadMax = max(input[startIndex + i], threadMax);
}
threadMax = shflDownReduceMax(threadMax);
if (idx == 0)
output[idy] = threadMax;
}
/*
we use PTX code reduce
*/
__global__
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
int reducedStrideNum,int blockSize, int blockNum)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadMax = FLOAT_MIN;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
threadMax = max(inputData[it * stride + iOffset], threadMax);
}
__syncthreads();
threadMax = shflDownReduceMax(threadMax);
if ((tid & 0x1f) == 0)
data[tid / 32] = threadMax;
__syncthreads();
/* use one warp to reduce remaining data */
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = 0;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
}
}
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
......@@ -336,20 +499,18 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
*/
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors(input && output, "Empty input or output tensors!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
}
}
......@@ -382,130 +543,149 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
do{
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
else {
iData = buf2ft16;
oData = buf1ft16;
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
iter++;
iter++;
}while(strideNum > 1);
} while (strideNum > 1);
}
BacktoCudaDev(input->devID, devIDBackup);
......
......@@ -27,6 +27,57 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use PTX code to reduce float data
*/
__device__ __forceinline__
float shflDownReduceSum(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.f32 %0, r0, %1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX code to reduce int data
*/
__device__ __forceinline__
int shflDownReduceSum(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.s32 %0, r0, %1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
......@@ -157,6 +208,7 @@ void KernelReduceSum(__half * input, __half * output,
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half value = isValid ? __hsub(input[blockSize * k + stride * j + iOffset], bias[threadIdx.x]) : __half(0);
DTYPE power2 = __half2float(power);
if(power2 != (DTYPE)1.0){
if(power2 == (DTYPE)2.0)
value = __hmul(value, value);
......@@ -276,25 +328,19 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
value2 = exp(value2);
}
/* load data into the shared mem */
data[tid] = value + value2;
value = value + value2;
__syncthreads();
/* unroll the warp */
if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
/* write result for this block to the output array */
if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
value = data[tid];
else value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
}
/*
......@@ -430,6 +476,195 @@ void KernelReduceSumFast(__half * input, __half * output,
#endif
}
/*
if data storage is discontinuius ,use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int blockIndex = idx / stride;
int offsetInBlock = idx % stride;
if (idx >= stride * blockNum)
return;
bias[idx % blockDim.x] = shift != NULL ? shift[idx] : 0;
DTYPE ans = 0;
#pragma unroll
for (int i = stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
i += stride){
DTYPE value = input[i];
value = value - bias[idx % blockDim.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) {
value = exp(value);
}
ans += value;
}
output[idx] = ans;
}
__global__
void KernelReduceSumOp(DTYPE * input, DTYPE * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
if (threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
__syncthreads();
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadSum = 0;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE value = inputData[it * stride + iOffset] - bias[threadIdx.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
__syncthreads();
threadSum = shflDownReduceSum(threadSum);
if ((tid & 0x1f) == 0) { data[tid / 32] = threadSum; }
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
}
}
__global__
void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
int strideNum, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
if (idx == 0)
bias[threadIdx.x / 32] = shift != NULL ? shift[idy] : 0;
int startIndex = idy * strideNum;
DTYPE threadSum = 0;
for (int i = idx; i < strideNum; i += 32) {
DTYPE value = input[startIndex + i] - bias[threadIdx.x / 32];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
threadSum = shflDownReduceSum(threadSum);
if (idx == 0)
output[idy] = threadSum;
}
/*
according the GPU's sm number allocation warp num
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8) {
warpNum = 8;
if (vectorNum < 20 * 4) {
warpNum = 16;
if (warpNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
/*
this situation we use block.x * grid.x deal one vector for continuous read
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
{
block.x = 512;
block.y = 1;
if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512;
else
grid.x = (stride * blockNum) / 512 + 1;
grid.y = 1;
}
/*
adjust threads.x number then we can use warp optimization
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
/*
sum the items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
......@@ -444,27 +679,24 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || output->unitNum == shift->unitNum), "Incorrect shift tensor size!");
CheckNTErrors(input && output, "Empty input or output tensors!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
}
}
if(input->dataType == X_FLOAT16){
CheckNTErrors((power == 0 || power == 0.5 || power == 1.0 || power == 2.0), "TODO!");
}
if(input->dataType == X_FLOAT16)
CheckNTErrors(power == 0 || power == 0.5 || power == 1.0 || power == 2.0, "TODO!");
int cudaGridSize[3];
int cudaBlockSize[3];
......@@ -496,136 +728,172 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
do{
if(input->dataType == DEFAULT_DTYPE){
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if(strideNum < 32){
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 128){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 256){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 512){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else{
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
else if(input->dataType == X_FLOAT16){
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if(strideNum < 32){
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 128){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 256){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 512){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp);
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
}
else{
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
iter++;
}while(strideNum > 1);
iter++;
} while (strideNum > 1);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
......
......@@ -176,15 +176,19 @@ make a new tensor to keep the result and return it
*/
XTensor LogSoftmax(const XTensor &x, int leadDim)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
_LogSoftmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
XLink::AddParamToHeadInt(&y, ld);
return y;
}
......@@ -248,7 +252,7 @@ There are two ways to implement this process.
Method 1. we compute dE/dy and dy/dx resepectively, and then reach dE/dx by dE/dx = dE/dy * dy/dx
(or more precisely dE/dx_j = \sum_{i} {dE/dy_i * dy_i/dx_j})
Method 2. we compute dE/dx (or dE/dx_j) in a single step, rather than resorting to the
sub-models dE/dy and dy/dx. We can do this by using dE/dx_j = -gold_j + exp(y_j)
sub-models of dE/dy and dy/dx. We can do this by using dE/dx_j = -gold_j + exp(y_j)
Here we choose Method 2, i.e., we straightforwardly compute dE/dx_j by
......@@ -257,12 +261,12 @@ dE/dx_j = -gold_j + exp(y_j)
(or dE/dx_j = -\delta(i,j) + exp(y_j) for a Maximum A Posteriori Estimation (MAP))
Method 1 is also fine but is more time consuming due to the summation over dimensions.
Note that this method is not good for the standard version softmax when working with
the cross entropy loss. Because it is numerical unstable. When we use a usual method to
Note that this method is not good for the standard version softmax when we work with
the cross entropy loss because it is numerical unstable. When we use a usual method to
define softmax, we have softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k}). It is trivial to
know that dy_i/dx_j = y_i * \delta(i,j) - y_i * y_j. As y_i and y_j could be a small number,
y_i * y_i would result in a much smaller one with a risk of lossing precision. This is even
worse we multiply dy_i/dx_j with dE/dy_i. So it is in general to use log softmax instead for
know that dy_i/dx_j = y_i * \delta(i,j) - y_i * y_j. As y_i and y_j could be small numbers,
y_i * y_i would result in a much smaller value with a risk of lossing precision. This is even
worse we multiply dy_i/dx_j with dE/dy_i. So it is in general to use log softmax for
better numerical stability.
>> gold - gold standard to measure error (or loss)
......
......@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
for(int i = 0; i < n; i++){
DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
if(IsNAN(r))
r = DTYPE_MIN;
if(IsINF(r))
r = DTYPE_MIN;
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
op[i * m + j] = r;
}
}
......@@ -143,14 +143,19 @@ make a new tensor to keep the result and return it
*/
XTensor Softmax(const XTensor &x, int leadDim)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
/* call _Softmax function */
_Softmax(&x, &y, leadDim);
_Softmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
return y;
}
......
......@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
if (r >(DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
y[offset] = r;
}
}
......@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
output[offset] = exp(input[offset] - maxData) / sumData;
DTYPE r = exp(input[offset] - maxData) / sumData;
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
output[offset] = r;
}
}
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#include "TDivDim.h"
#include "../core/arithmetic/DivDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2) = (2, 4), n = 0, alpha = 0.0.
*/
bool TestDivDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 0);
_DivDim(cMe, b, 0);
cUser = DivDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sum function */
_DivDim(aGPU, bGPU, cGPU, 0);
_DivDim(cMeGPU, bGPU, 0);
cUserGPU = DivDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting.
In this case, (2, 4) / (2, 2) = (2, 4), n = 1.
*/
bool TestDivDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {0.0F, -1.0F, -2.0F, 3.0F},
{4.0F, -5.0F, -6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 1);
_DivDim(cMe, b, 1);
cUser = DivDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call sum function */
_DivDim(aGPU, bGPU, cGPU, 1);
_DivDim(cMeGPU, bGPU, 1);
cUserGPU = DivDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for DivDim Function */
bool TestDivDim()
{
XPRINT(0, stdout, "[TEST DIVDIM] tensor division c(i) = a/b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDivDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDivDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#ifndef __TEST_DIVDIM_H__
#define __TEST_DIVDIM_H__
#include "../core/arithmetic/DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for DivDim Function */
bool TestDivDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 0);
_MultiplyDimMe(cMe, b, 0);
cUser = MultiplyDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 0);
_MultiplyDimMe(cMeGPU, bGPU, 0);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor multiplication c = a*b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool TestMultiplyDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = {1.0F, -1.0F , 1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 1);
_MultiplyDimMe(cMe, b, 1);
cUser = MultiplyDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 1);
_MultiplyDimMe(cMeGPU, bGPU, 1);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
XPRINT(0, stdout, "[TEST MULTIPLYDIM] tensor multiplication c = a * b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMultiplyDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiplyDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#ifndef __TEST_MULTIPLYDIM_H__
#define __TEST_MULTIPLYDIM_H__
#include "../core/arithmetic/MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MultiplyDim Function */
bool TestMultiplyDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MULTIPLYDIM_H__
\ No newline at end of file
......@@ -24,7 +24,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: sum the items along a dimension of the tensor.
case 1: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
......@@ -90,8 +91,8 @@ bool TestReduceSum1()
tUser2 = ReduceSum(*s, 1, *shift2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -120,8 +121,8 @@ bool TestReduceSum1()
tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
......@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2) = (2, 4), n = 0.
*/
bool TestSumDim1()
{
......@@ -79,9 +80,9 @@ bool TestSumDim1()
cUser = SumDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum)
&& cMe->CheckData(answer, aUnitNum)
&& cUser.CheckData(answer, aUnitNum);
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -106,9 +107,9 @@ bool TestSumDim1()
cUserGPU = SumDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum)
&& cMeGPU->CheckData(answer, aUnitNum)
&& cUserGPU.CheckData(answer, aUnitNum);
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
......@@ -139,7 +140,8 @@ bool TestSumDim1()
/*
case 2: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2, 2) = (2, 4), n = 1.
*/
bool TestSumDim2()
{
......@@ -192,9 +194,9 @@ bool TestSumDim2()
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum)
&& cMe->CheckData(answer, aUnitNum)
&& cUser.CheckData(answer, aUnitNum);
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -219,9 +221,9 @@ bool TestSumDim2()
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum)
&& cMeGPU->CheckData(answer, aUnitNum)
&& cUserGPU.CheckData(answer, aUnitNum);
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#include "TTmp.h"
#include "../XTensor.h"
#include "../../xc/ultility.h"
#include "../../xc/myCode.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
bool TestTmp1()
{
XTensor a;
XTensor b;
InitTensor4D(&a, 8, 32, 106, 106, X_FLOAT, -1, NULL);
FILE * fReadOrigin = fopen("V:/temp/input.dump", "rb");
a.Read(fReadOrigin, "a-plus-bias");
fclose(fReadOrigin);
b = Softmax(a, 3);
XTensor c;
InitTensor4D(&c, 8, 32, 106, 106, X_FLOAT, -1, NULL);
FILE * fReadResult = fopen("V:/temp/input.dump.result", "rb");
c.Read(fReadResult, "");
fclose(fReadResult);
printf("\n\nThis is CPU!\n");
b.Dump(stderr, "b", 100);
printf("\n\n");
c.Dump(stderr, "c", 100);
bool cpuTest;
cpuTest = b.CheckData(c.data, b.unitNum, 1e-6F);
if(cpuTest == true)
printf("CPU Yeah!");
else
printf("CPU ops..");
exit(1);
#ifdef USE_CUDA
XTensor aGPU;
XTensor bGPU;
InitTensor4D(&aGPU, 8, 32, 106, 106, X_FLOAT, 0, NULL);
InitTensor4D(&bGPU, 8, 32, 106, 106, X_FLOAT, 0, NULL);
fReadOrigin = fopen("V:/temp/input.dump", "rb");
aGPU.Read(fReadOrigin, "a-plus-bias");
fclose(fReadOrigin);
//bGPU = Softmax(aGPU, 3);
_Softmax(&aGPU, &bGPU, 3);
printf("\n\nThis is GPU\n");
bGPU.Dump(stderr, "bGPU", 100);
bool gpuTest;
gpuTest = bGPU.CheckData(c.data, bGPU.unitNum, 1e-4F);
if(gpuTest == true)
printf("GPU Yeah!");
else
printf("GPU ops..");
#endif // USE_CUDA
exit(1);
return 0;
}
bool TestTmp2()
{
XTensor a;
XTensor b;
InitTensor4D(&a, 8, 32, 106, 106, X_FLOAT, -1, NULL);
InitTensor4D(&b, 8, 32, 106, 106, X_FLOAT, -1, NULL);
//FILE * fReadResultGold = fopen("V:/temp/input.dump.gold", "rb");
//a.Read(fReadResultGold, "input");
//fclose(fReadResultGold);
FILE * fReadResult = fopen("V:/temp/input.dump", "rb");
b.Read(fReadResult, "a-plus-bias");
fclose(fReadResult);
ShowData(&b, "");
bool flag = CheckTensorData(a, b, 1e-3F);
if (flag)
printf("yeah");
else
printf("ops.");
exit(1);
return 0;
}
/* other cases */
/*
TODO!!
*/
/* test for Tmp Function */
bool TestTmp()
{
XPRINT(0, stdout, "[TEST Temp] temporary test\n");
bool returnFlag = true, caseFlag = true;
///* case 1 test */
//caseFlag = TestTmp1();
//if (!caseFlag) {
// returnFlag = false;
// XPRINT(0, stdout, ">> case 1 failed!\n");
//}
//else
// XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestTmp2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-13
*/
#ifndef __TEST_TMP_H__
#define __TEST_TMP_H__
#include "../core/CHeader.h"
#include "../function/FHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
bool TestTmp();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_TMP_H__
......@@ -99,8 +99,8 @@ bool TestUnsqueeze1()
tUser2 = Unsqueeze(*s, 2, 2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......
......@@ -29,6 +29,8 @@ bool Test()
bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
//wrong = !TestTmp() || wrong;
wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
wrong = !TestConcatenate() || wrong;
......@@ -38,6 +40,7 @@ bool Test()
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong;
wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong;
......@@ -46,6 +49,7 @@ bool Test()
wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong;
wrong = !TestMultiplyDim() || wrong;
wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong;
......
......@@ -22,6 +22,8 @@
#ifndef __TEST_H__
#define __TEST_H__
#include "TTmp.h"
#include "TAbsolute.h"
#include "TClip.h"
#include "TConcatenate.h"
......@@ -31,6 +33,7 @@
#include "TCopyIndexed.h"
#include "TCopyValues.h"
#include "TDiv.h"
#include "TDivDim.h"
#include "TExp.h"
#include "TLog.h"
#include "TMatrixMul.h"
......@@ -39,6 +42,7 @@
#include "TMatrixMulBatched.h"
#include "TMerge.h"
#include "TMultiply.h"
#include "TMultiplyDim.h"
#include "TNegate.h"
#include "TNormalize.h"
#include "TPower.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论