Commit 44bf9fa6 by linye

Merged the latest code and fixed some bugs.

parent c9c53870
@@ -66,6 +66,7 @@ void PowerFP16Test();
 void ClipFP16Test();
 void GatherFP16Test();
 void SetDataGPUFP16Test();
+void SumIntTest();
 using namespace nts;
 using namespace fnnlm;
@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
 //return 0;
 //ConvertBackwardTest();
 //return 0;
-//DropoutFP16Test();
-//return 0;
 //UnsqueezeFP16Test();
 //return 0;
 //ReduceMaxFP16Test();
@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
 //InitCPUFP16Test();
 //return 0;
-SetDataGPUFP16Test();
-return 0;
-MycublasGemmExTest();
-return 0;
+//MycublasGemmExTest();
+//return 0;
+//SumIntTest();
+//return 0;
+//DropoutFP16Test();
+//return 0;
+//SetDataGPUFP16Test();
+//return 0;
 if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
 c1.Dump(stderr, "c1:");
 }
+void SumIntTest() {
+XTensor a;
+XTensor b;
+XTensor c;
+XTensor inta;
+XTensor intb;
+XTensor intc;
+InitTensor2D(&a, 2, 2, X_FLOAT, 0);
+InitTensor2D(&b, 2, 2, X_FLOAT, 0);
+a.SetDataRand(-5.0, 5.0);
+b.SetDataRand(-5.0, 5.0);
+a.Dump(stderr, "a:");
+b.Dump(stderr, "b:");
+inta = ConvertDataType(a, X_INT);
+intb = ConvertDataType(b, X_INT);
+inta.Dump(stderr, "inta:");
+intb.Dump(stderr, "intb:");
+intc = Sum(inta, intb);
+intc.Dump(stderr, "intc:");
+}
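A note on what SumIntTest exercises, as I read the calls above (not stated in the commit): conversion to X_INT truncates each value toward zero, so summing after conversion is not the same as converting the float sum.

// Hypothetical values illustrating the truncation order in SumIntTest:
//   a = { 1.7F, -2.3F },  b = { 2.6F, 0.9F }
//   inta = ConvertDataType(a, X_INT) -> {  1, -2 }
//   intb = ConvertDataType(b, X_INT) -> {  2,  0 }
//   intc = Sum(inta, intb)           -> {  3, -2 }
// while ConvertDataType(Sum(a, b), X_INT) would give { 4, -1 }.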
 void SetDataGPUFP16Test() {
 srand(time(NULL));
-XTensor a1;
-InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
-_SetDataRand(&a1, -5.0F, 5.0F);
-a1.Dump(&a1, stderr, "a:\n");
-XTensor a;
-InitTensor2D(&a, 2, 2, X_FLOAT16, 0);
-_SetDataRand(&a, -5.0F, 5.0F);
-a.Dump(&a, stderr, "a:\n");
-XTensor b;
-InitTensor2D(&b, 2, 2, X_FLOAT, 0);
-b = ConvertDataType(a, X_FLOAT);
-b.Dump(stderr, "b:\n");
+/*XTensor m;
+InitTensor2D(&m, 2, 2, X_FLOAT, 0);
+m.SetDataRand(0.0, 10.0);*/
+XTensor * m = NewTensor2D(2, 2, X_FLOAT, 0);
+m->SetDataRand(0.0, 10.0);
+//XTensor a1;
+//InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
+//_CopyValues(&m, &a1);
+//_SetDataRand(&a1, -1.0F, 1.0F);
+//a1.Dump(&a1, stderr, "a:\n");
+/*XTensor a;
+InitTensor2D(&a, 2, 2, X_FLOAT16, 0);*/
+XTensor * a = NewTensor2D(2, 2, X_FLOAT16, 0);
+_ConvertDataType(m, a);
+a->Dump(a, stderr, "a:\n");
+_SetDataRand(a, 0.0F, 1.0F);
+a->Dump(a, stderr, "a:\n");
+//XTensor b;
+//InitTensor2D(&b, 2, 2, X_FLOAT, 0);
+//b = ConvertDataType(a, X_FLOAT);
+//b.Dump(stderr, "b:\n");
 }
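The point of the rewritten test, as I read it: _SetDataRand can now fill an X_FLOAT16 tensor directly on the GPU, presumably via the per-device cuRAND generator added in XDevice further down. A minimal sketch of the pattern under test (names follow the test above):

// Minimal sketch only:
XTensor * h = NewTensor2D(2, 2, X_FLOAT16, 0); // FP16 tensor on device 0
_SetDataRand(h, 0.0F, 1.0F);                   // filled directly in FP16 on the GPU
h->Dump(h, stderr, "h:\n");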
 void ClipFP16Test() {
@@ -447,7 +494,7 @@ void FloatToInt8Test() {
 InitTensor2D(&a, 2, 2, X_FLOAT, 0);
 InitTensor2D(&b, 2, 2, X_INT8, 0);
-a.SetDataRand(-5.0F, 5.0F);
+a.SetDataRand(5.0F, 5.0F);
 a.Dump(stderr, "a:");
 b = ConvertDataType(a, X_INT8);
@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
 halfA = ConvertDataType(a, X_FLOAT16);
 halfB = ConvertDataType(b, X_FLOAT16);
-c1 = MultiplyDim(a1, b1, 1, 0);
-halfC = MultiplyDim(halfA, halfB, 1, 0);
+c1 = MultiplyDim(a1, b1, 1);
+halfC = MultiplyDim(halfA, halfB, 1);
 c = ConvertDataType(halfC, X_FLOAT);
@@ -950,26 +997,26 @@ void SubFP16Test()
 void DropoutFP16Test()
 {
+srand(time(NULL));
 XTensor a;
 XTensor b;
 XTensor b1;
 XTensor halfA;
 XTensor halfB;
-InitTensor2D(&a, 10, 10, X_FLOAT, 0);
+InitTensor2D(&a, 10, 1, X_FLOAT, 0);
 a.SetDataRand(-5.0F, 5.0F);
-/*a.Dump(stderr, "a:");*/
+a.Dump(stderr, "a:");
 halfA = ConvertDataType(a, X_FLOAT16);
-halfB = Dropout(halfA, 0.5);
-b1 = Dropout(a, 0.3);
-b = ConvertDataType(halfB, X_FLOAT);
-b.Dump(stderr, "b:");
-//b1.Dump(stderr, "b1:");
+halfB = Dropout(halfA, 0.2);
+b1 = Dropout(a, 0.2);
+halfB.Dump(&halfB, stderr, "halfB:");
+b1.Dump(&b1, stderr, "b1:");
 }
 void ConvertBackwardTest()
@@ -1069,132 +1116,4 @@ void ConvertTest()
 a1.Dump(stderr, "halfa:");
 }
 }
\ No newline at end of file
void MatrixMulFloat16AndFloatTest()
{
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 5000, 5000, X_FLOAT, 0);
InitTensor2D(&b, 5000, 5000, X_FLOAT, 0);
InitTensor2D(&c, 5000, 5000, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
b.SetDataRand(-10.0F, 10.0F);
int recurrentNum = 10000;
double startT1 = GetClockSec();
for (int i1 = 0; i1 < recurrentNum; i1++)
{
c = MatrixMul(&a, &b);
}
printf("ElapsedFloat32 = %.2f s \n", GetClockSec() - startT1);
double startT2 = GetClockSec();
for (int i2 = 0; i2 < recurrentNum; i2++)
{
c = MatrixMulFloat16(&a, &b);
}
printf("ElapsedFloat16 = %.2f s \n", GetClockSec() - startT2);
}
void MatrixMul2DFloat16Test()
{
XTensor a;
XTensor b;
XTensor c;
XTensor a00;
XTensor b00;
XTensor c00;
XTensor c01;
XTensor halfa;
XTensor halfb;
XTensor halfc;
InitTensor3D(&a, 3, 2, 3, X_FLOAT, 0);
InitTensor2D(&b, 3, 2, X_FLOAT, 0);
InitTensor3D(&c, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&a00, 3, 2, 3, X_FLOAT, 0);
InitTensor2D(&b00, 3, 2, X_FLOAT, 0);
InitTensor3D(&c00, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&c01, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&halfa, 3, 2, 3, X_FLOAT16, 0);
InitTensor2D(&halfb, 3, 2, X_FLOAT16, 0);
InitTensor3D(&halfc, 3, 2, 2, X_FLOAT16, 0);
DTYPE aData[3][2][3] = { { { 0.02121212144F, -1.0234556667F, 2.04354565678F },
{ 2.0234567332F, -1.0213469654F, -3.01568321F } },
{ { -1.022347899421F, 2.012589653664F, 4.035346643F },
{ 3.01234544634F, 1.0324354635F, 2.0546578332F } },
{ { -1.0235743446F, 3.0335753334F, 2.0653323234F },
{ 1.03235643232F, -1.023463345542F, 0.0335563322F } } };
DTYPE bData[3][2] = { { -1.034466323232F, -2.0546676442F },
{ -3.0224354656F, 4.034467866532F },
{ 5.02354657442F, -6.0324355767443F } };
a.SetData(aData, 18);
b.SetData(bData, 6);
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
_ConvertDataType(&a, &halfa);
_ConvertDataType(&b, &halfb);
_MatrixMul(&halfa, X_NOTRANS, &halfb, X_NOTRANS, &halfc);
_ConvertDataType(&halfc, &c01);
_ConvertDataType(&halfa, &a00);
_ConvertDataType(&halfb, &b00);
_MatrixMul(&a00, X_NOTRANS, &b00, X_NOTRANS, &c00);
c.Dump(stderr, "c:");
c01.Dump(stderr, "c01:");
c00.Dump(stderr, "c0:");
XTensor a1;
XTensor b1;
XTensor c1;
XTensor a10;
XTensor b10;
XTensor c10;
XTensor c11;
XTensor halfa1;
XTensor halfb1;
XTensor halfc1;
InitTensor2D(&a1, 3, 3, X_FLOAT, 0);
InitTensor2D(&b1, 3, 2, X_FLOAT, 0);
InitTensor2D(&c1, 3, 2, X_FLOAT, 0);
InitTensor2D(&a10, 3, 3, X_FLOAT, 0);
InitTensor2D(&b10, 3, 2, X_FLOAT, 0);
InitTensor2D(&c10, 3, 2, X_FLOAT, 0);
InitTensor2D(&c11, 3, 2, X_FLOAT, 0);
InitTensor2D(&halfa1, 3, 3, X_FLOAT16, 0);
InitTensor2D(&halfb1, 3, 2, X_FLOAT16, 0);
InitTensor2D(&halfc1, 3, 2, X_FLOAT16, 0);
DTYPE a1Data[3][3] = { { 0.02121212144F, -1.0234556667F, 2.043541565678F },
{ -2.0234567332F, 1.0213469657774F, -3.0156837543321F } ,
{ 1.022347899421F, -2.012589653664F, 4.03534634643F }};
DTYPE b1Data[3][2] = { { 1.034466323232F, -2.0546676442F },
{ 3.0224354656F, -4.034467866532F },
{ 5.02354657442F, 6.0324355767443F } };
a1.SetData(a1Data, 9);
b1.SetData(b1Data, 6);
_MatrixMul(&a1, X_NOTRANS, &b1, X_NOTRANS, &c1);
_ConvertDataType(&a1, &halfa1);
_ConvertDataType(&b1, &halfb1);
_MatrixMul(&halfa1, X_NOTRANS, &halfb1, X_NOTRANS, &halfc1);
_ConvertDataType(&halfc1, &c11);
_ConvertDataType(&halfa1, &a10);
_ConvertDataType(&halfb1, &b10);
_MatrixMul(&a10, X_NOTRANS, &b10, X_NOTRANS, &c10);
c1.Dump(stderr, "c1:");
c11.Dump(stderr, "c11:");
c10.Dump(stderr, "c10:");
}
@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
 XTensor * dedy, XTensor * padding,
 LOSS_FUNCTION_NAME lossName)
 {
-//return;
 if(gold == NULL){
 if(dedy->dataType == X_FLOAT)
 _SetDataFixedFloat(dedy, 1.0F);
@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
 else if(dedy->dataType == X_INT)
 _SetDataFixedInt(dedy, 1);
 else{
-//ShowNTErrors("TODO");
+ShowNTErrors("TODO");
 }
 return;
 }
......
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
 GradMultiply(node, isEfficient);
 else if(operID == MATH_MULTIPLYDIM)
 GradMultiplyDim(node, isEfficient);
+else if (operID == MATH_MULTIPLYBROADCAST)
+GradMultiplyBroadcast(node, isEfficient);
 else if(operID == MATH_NEGATE)
 GradNegate(node, isEfficient);
 else if(operID == MATH_NORMALIZE)
@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
 GradSum(node, isEfficient);
 else if(operID == MATH_SUMDIM)
 GradSumDim(node, isEfficient);
+else if(operID == MATH_SUMBROADCAST)
+GradSumBroadcast(node, isEfficient);
 else if(operID == REDUCE_REDUCEMEAN)
 GradReduceMean(node, isEfficient);
 else if(operID == REDUCE_REDUCESUM)
@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
 */
 void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 {
-/* XTensor node1;
-node1 = ConvertDataType(*node, X_FLOAT);
-node1.Dump(stderr, "node:");*/
 XLink &income = node->income;
 CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
@@ -751,14 +751,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 /* dE/da */
 _MultiplyDim(node->grad, b, a->grad, n, 1.0F);
-//XTensor a1;
-//a1 = ConvertDataType(*a, X_FLOAT);
-//a1.Dump(stderr, "a:");
-//XTensor b1;
-//b1 = ConvertDataType(*b, X_FLOAT);
-//b1.Dump(stderr, "b:");
 /* dE/db */
 int order = a->order;
@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 size of b. Then we can reduce the matrix into a row vector. */
 bGradTMP->Reshape(2, reshapedSize);
-/*XTensor bGradTMP1;
-bGradTMP1 = ConvertDataType(*bGradTMP, X_FLOAT);
-bGradTMP1.Dump(stderr, "bGradTMP:");*/
 //if(b->outgo.tailNum > 1){
 XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
 _ReduceSum(bGradTMP, bGradTMP2, 0);
-/* XTensor bGradTMP21;
-bGradTMP21 = ConvertDataType(*bGradTMP2, X_FLOAT);
-bGradTMP21.Dump(stderr, "bGradTMP2:");*/
 _Sum(b->grad, bGradTMP2, b->grad);
 DelTensorBuf(bGradTMP2);
@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 DelTensorBuf(interGrad);
 }
-//printf("\n");
-//XTensor a2;
-//a2 = ConvertDataType(*a, X_FLOAT);
-//a2.Dump(stderr, "a2:");
-//XTensor b2;
-//b2 = ConvertDataType(*b, X_FLOAT);
-//b2.Dump(stderr, "b2:");
 DelTensorBuf(bGradTMP);
 node->visitMark = NODE_FINISHED;
 }
 /*
+gradient for multiplication by broadcasting:
+c = a * b
+where some dimensions of b are of size 1
+dE/da = dE/dc * b
+dE/db = (dE/dc * a).reduce(0...n)
+where reduce(0...n) is the reduction along the dimensions
+whose size is 1 in b. Note that there might be several reductions.
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+an efficient manner
+*/
+void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
+{
+XLink &income = node->income;
+CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");
+XTensor * a = income.tails[0];
+XTensor * b = income.tails[1];
+DTYPE beta = income.GetParam(0);
+XNoder::MakeGrad(a);
+_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
+if(b->isVar || b->income.tailNum > 0){
+ShowNTErrors("TODO");
+}
+}
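A shape sketch of the rule implemented above (the sizes are hypothetical, chosen only to illustrate the broadcast):

// c = a * b with a : (4, 8) and b : (1, 8)  ->  c : (4, 8)
// dE/da = dE/dc * b, which is what _MultiplyBroadcast(node->grad, b, a->grad, 1.0F) computes
// dE/db = reduce_sum(dE/dc * a, dim 0)  ->  (1, 8)
// Only the a branch is implemented here; the b branch above is still a TODO.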
+/*
 gradient for negate
 for
 c = -a
@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
 _Sum(a->grad, node->grad, a->grad, scale);
 node->visitMark = NODE_FINISHED;
 }
@@ -1284,6 +1288,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
 node->visitMark = NODE_FINISHED;
 }
+/*
+gradient for sum by broadcasting:
+c = a + b * \beta
+where some dimensions of b are of size 1
+dE/da = dE/dc
+dE/db = dE/dc.reduce(0..n) * \beta
+where reduce(0..n) is the reduction along the dimensions
+whose size is 1 in b
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+an efficient manner
+*/
+void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
+{
+XLink &income = node->income;
+CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");
+XTensor * a = income.tails[0];
+XTensor * b = income.tails[1];
+DTYPE beta = income.GetParam(0);
+XNoder::MakeGrad(a);
+_Sum(a->grad, node->grad, a->grad);
+if(b->isVar || b->income.tailNum > 0){
+ShowNTErrors("TODO");
+}
+}
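The same reduction picture for the sum case (hypothetical sizes again):

// c = a + b * beta with a : (4, 8) and b : (1, 8)
// dE/da = dE/dc, implemented by the _Sum(a->grad, node->grad, a->grad) call above
// dE/db = beta * reduce_sum(dE/dc, dim 0)  ->  (1, 8), not yet implemented (TODO branch)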
 /*
 gradient for reduceMean
 for
......
@@ -109,6 +109,11 @@ private:
 static
 void GradMultiplyDim(XTensor * node, bool isEfficient);
+/* gradient for multiply by broadcasting: c = a * b
+where some dimensions of b are of size 1 */
+static
+void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
 /* gradient for negate */
 static
 void GradNegate(XTensor * node, bool isEfficient);
@@ -143,6 +148,11 @@ private:
 static
 void GradSumDim(XTensor * node, bool isEfficient);
+/* gradient for sum by broadcasting: c = a + b * \beta
+where some dimensions of b are of size 1 */
+static
+void GradSumBroadcast(XTensor * node, bool isEfficient);
 /* gradient for reduceMean */
 static
 void GradReduceMean(XTensor * node, bool isEfficient);
......
@@ -27,7 +27,6 @@
 #include "XBackwardFunc.h"
 #include "XBackwardShape.h"
 #include "../tensor/XName.h"
-#include "../tensor/core/CHeader.h"
 namespace nts{
@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
 XMathGrad::MakeGrad(node, isEfficent);
 else if(XFuncGrad::IsFunc(node))
 XFuncGrad::MakeGrad(node, isEfficent);
-else if(XDataGrad::IsDataOP(node))
+else if (XDataGrad::IsDataOP(node))
 XDataGrad::MakeGrad(node, isEfficent);
 else if(XShapeGrad::IsShapeOP(node))
 XShapeGrad::MakeGrad(node, isEfficent);
......
@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
 InitModelTensor2D(s, batchSize, model.vSize, model);
 InitModelTensor2D(y, batchSize, model.vSize, model);
+///* s = h_last * w */
+//_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
 XTensor h_last1;
 h_last1 = ScaleAndShift(h_last, 100, 0);
@@ -850,22 +853,27 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
 int8H_last = ConvertDataType(h_last1, X_INT8);
 int8W = ConvertDataType(w1, X_INT8);
-XTensor s1;
-InitTensor2D(&s1, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
-_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
-s = ScaleAndShift(s1, 0.0001, 0);
+XTensor s1;
+InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);
+_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
 XTensor b2D;
-InitTensor(&b2D, &s);
+InitTensor2D(&b2D, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
 _Unsqueeze(&b, &b2D, 0, batchSize);
-_Sum(&s, &b2D, &s);
+b2D = ScaleAndShift(b2D, 10000, 0);
+XTensor b2D1;
+b2D1 = ConvertDataType(b2D, X_INT);
+_Sum(&s1, &b2D1, &s1);
+s = ConvertDataType(s1, X_FLOAT);
+s = ScaleAndShift(s, 0.0001, 0);
 /* y = softmax(s) */
 _LogSoftmax(&s, &y, 1);
 }
 }
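The scale bookkeeping implied by the constants above, as I read the code (the rationale is not stated in the commit):

// int8H_last is roughly 100 * h_last and int8W roughly 100 * w, so the
// integer product s1 carries a factor of 100 * 100 = 10000. Scaling b2D by
// 10000 lets the bias be added in the integer domain with a matching factor,
// and the final ScaleAndShift(s, 0.0001, 0) divides the 10000 back out
// before the log-softmax.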
@@ -1203,12 +1211,12 @@ void Test(const char * test, const char * result, FNNModel &model)
 fprintf(ofile, "%d ", ngrams[0].words[i]);
 for (int i = 0; i < ngramNum; i++)
 fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
 fprintf(ofile, "||| ");
 for (int i = 0; i < model.n - 1; i++)
 fprintf(ofile, "<s> ");
 for (int i = 0; i < ngramNum; i++)
 fprintf(ofile, "%f ", probs.Get1D(i));
 fprintf(ofile, "||| %f\n", prob);
 loss += -prob;
 wordCount += ngramNum;
......
@@ -53,6 +53,42 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
 void T2TAttention::InitModel(int argc, char ** argv,
 bool myIsMasked, int myIgnored,
@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
 InitTensor2D(&wq, d, dk, X_FLOAT16, devID, mem);
 InitTensor2D(&wv, d, dv, X_FLOAT16, devID, mem);
 InitTensor2D(&wa, d, d, X_FLOAT16, devID, mem);
+InitTensor2D(&wbig, d, 3 * d, X_FLOAT16, devID, mem);
 float scale = 1.0F;
 float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
 float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
 float finfouta = (float)sqrt(6.0F * scale / (d + d));
+float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));
 wk.SetDataRand(-finfoutk, finfoutk);
 wq.SetDataRand(-finfoutk, finfoutk);
 wv.SetDataRand(-finfoutv, finfoutv);
 wa.SetDataRand(-finfouta, finfouta);
-//_SetDataRand(&wk, -finfoutk, finfoutk);
-//_SetDataRand(&wq, -finfoutk, finfoutk);
-//_SetDataRand(&wv, -finfoutv, finfoutv);
-//_SetDataRand(&wa, -finfouta, finfouta);
+wbig.SetDataRand(-finfoutbig, finfoutbig);
 }
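The init bounds follow the Xavier/Glorot uniform rule, limit = sqrt(6 / (fan_in + fan_out)); for the new combined projection wbig of size d x 3d this gives finfoutbig = sqrt(6 * scale / (d + 3 * d)), which is exactly the line added above.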
 /*
@@ -103,42 +138,138 @@ make the network
 >> isTraining - indicates whether the model is used for training
 << return - multi-attention result
 */
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transformation before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
-XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
+XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
 {
-XTensor halfK;
 XTensor halfK2;
 XTensor halfQ2;
 XTensor halfV2;
+XTensor halfK;
 halfK = ConvertDataType(k, X_FLOAT16);
-halfK2 = MMul(halfK, wk);
-halfQ2 = MMul(halfK, wq);
-halfV2 = MMul(halfK, wv);
+if (selfatt) {
+XTensor halfCon;
+XList halfSplit;
+halfCon = MMul(halfK, wbig);
+int d1 = halfCon.GetDim(0);
+int d2 = halfCon.GetDim(1);
+int d3 = halfCon.GetDim(2) / 3;
+InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
+InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
+InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);
+halfSplit.Add(&halfQ2);
+halfSplit.Add(&halfK2);
+halfSplit.Add(&halfV2);
+Split(halfCon, halfSplit, 2, 3);
+}
+else {
+XTensor halfQ;
+XTensor halfV;
+halfQ = ConvertDataType(q, X_FLOAT16);
+halfV = ConvertDataType(v, X_FLOAT16);
+/* linear transformation before self-attention */
+halfK2 = MMul(halfK, wk);
+halfQ2 = MMul(halfQ, wq);
+halfV2 = MMul(halfV, wv);
+}
 XTensor halfKheads;
 XTensor halfQheads;
 XTensor halfVheads;
+/* multi head */
 halfKheads = Split(halfK2, halfK2.order - 1, nhead);
 halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
 halfVheads = Split(halfV2, halfV2.order - 1, nhead);
-XTensor halfMask;
+XTensor halfAtt;
 XTensor halfDot;
 XTensor halfScalar;
-XTensor halfAtt;
-halfMask = ConvertDataType(mask, X_FLOAT16);
 /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
 halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);
-//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
 if (isMasked) {
+XTensor halfMask;
+halfMask = ConvertDataType(mask, X_FLOAT16);
 halfDot = Sum(halfDot, halfMask);
 }
 halfDot = Linear(halfDot, 1.0F / (float)sqrt((float)dk / nhead));
@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
 if (isTraining && dropoutP > 0)
 halfScalar = Dropout(halfScalar, dropoutP);
-/*att = BMMul(scalar, vheads);*/
 halfAtt = BMMul(halfScalar, halfVheads);
 /* concatenate the heads */
 return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
 }
 }
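What the selfatt branch buys, as I read it: one d x 3d GEMM against wbig replaces three separate wq/wk/wv projections, and Split carves the result back into Q, K and V. A shape sketch with hypothetical sizes:

// k       : (batch, len, d)
// halfCon : (batch, len, 3d) = MMul(halfK, wbig)
// Split(halfCon, halfSplit, 2, 3) -> halfQ2, halfK2, halfV2, each (batch, len, d)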
@@ -59,7 +59,8 @@ public:
 /* transformation after dot-product attention */
 XTensor wa;
+XTensor wbig;
 /* size of transformed Q and K */
 int dk;
@@ -95,7 +96,7 @@ public:
 int myDevID = -1, XMem * myMem = NULL);
 /* make the network */
-XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
+XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
 };
 }
......
@@ -21,6 +21,8 @@
 #include <math.h>
 #include "T2TDecoder.h"
+#include "T2TUtility.h"
+#include "T2TLayerNormal.h"
 #include "../../tensor/core/CHeader.h"
 namespace transformer
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
 bool myIsMasked, int myIgnored,
 int myDevID, XMem * myMem)
 {
-AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+devID = myDevID;
+mem = myMem;
+ignored = myIgnored;
+LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
+CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+/* embedding model */
+embedder.InitModel(argc, argv, devID, mem, false);
+attentions = new T2TAttention[nlayer];
+fnns = new T2TFNN[nlayer];
+attLayerNorms = new T2TLN[nlayer];
+fnnLayerNorms = new T2TLN[nlayer];
 attentionsEnde = new T2TAttention[nlayer];
 attEndeLayerNorms = new T2TLN[nlayer];
 /* initialize the stacked layers */
-for(int i = 0; i < nlayer; i++){
-attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+for (int i = 0; i < nlayer; i++) {
+attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+fnns[i].InitModel(argc, argv, myDevID, myMem);
+attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
 attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
 }
 }
 /*
@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
 /******************/
 /* self attention */
-att = attentions[i].Make(x, x, x, mask, isTraining);
+att = attentions[i].Make(x, x, x, mask, isTraining, true);
 /* dropout */
 if(isTraining && dropoutP > 0)
@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
 /*****************************/
 /* encoder-decoder attention */
-ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
+ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
 /* dropout */
 if(isTraining && dropoutP > 0)
......
@@ -27,9 +27,56 @@
 namespace transformer
 {
-class AttDecoder : public AttEncoder
+class AttDecoder
 {
 public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* output tensor of the encoder */
XTensor * output;
 /* encoder-decoder attention model of each layer */
 T2TAttention * attentionsEnde;
@@ -53,4 +100,4 @@ public:
 }
 #endif
\ No newline at end of file
@@ -48,12 +48,18 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
 {
 devID = myDevID;
 mem = myMem;
-LoadParamInt(argc, argv, "vsize", &vSize, -1);
+if(isEnc){
+LoadParamInt(argc, argv, "vsize", &vSize, -1);
+}
+else{
+LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+}
+//LoadParamInt(argc, argv, "vsize", &vSize, -1);
 LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
 LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
 LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 delete[] data;
 }
 /*
 make the network
 */
......
@@ -71,7 +71,7 @@ public:
 ~T2TEmbedder();
 /* initialize the model */
-void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
 /* make positional embeddings */
 void MakePosEmbedding(int eSize, int d, int length);
......
@@ -116,8 +116,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
 XTensor res;
 /* self attention */
-att = attentions[i].Make(x, x, x, mask, isTraining);
+att = attentions[i].Make(x, x, x, mask, isTraining, true);
 /* dropout */
 if(isTraining && dropoutP > 0)
 att = Dropout(att, dropoutP);
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
 }
 }
@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
 XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
 _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
-_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
-_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
-_ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
-_Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
+//_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
+//_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
+_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
+_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
 DelTensorBuf(maskEncDecTMPDec);
 DelTensorBuf(maskEncDecTMPEnc);
@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
 _Sum(&maskEnc, padding3, &maskEnc);
 encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+//encoding.Dump(stderr, "encoding",10);
 decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
+//decoding.Dump(stderr, "decoding", 10);
 outputLayer->Make(decoding, output);
 delete[] dims;
@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
 list.Add(&encoder->fnns[i].b1);
 list.Add(&encoder->fnns[i].w2);
 list.Add(&encoder->fnns[i].b2);
-list.Add(&encoder->attentions[i].wk);
-list.Add(&encoder->attentions[i].wq);
-list.Add(&encoder->attentions[i].wv);
+//list.Add(&encoder->attentions[i].wk);
+//list.Add(&encoder->attentions[i].wq);
+//list.Add(&encoder->attentions[i].wv);
+list.Add(&encoder->attentions[i].wbig);
 list.Add(&encoder->attentions[i].wa);
 list.Add(&encoder->fnnLayerNorms[i].w);
 list.Add(&encoder->fnnLayerNorms[i].b);
@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
 list.Add(&decoder->attentionsEnde[i].wa);
 list.Add(&decoder->attEndeLayerNorms[i].w);
 list.Add(&decoder->attEndeLayerNorms[i].b);
-list.Add(&decoder->attentions[i].wk);
-list.Add(&decoder->attentions[i].wq);
-list.Add(&decoder->attentions[i].wv);
+//list.Add(&decoder->attentions[i].wk);
+//list.Add(&decoder->attentions[i].wq);
+//list.Add(&decoder->attentions[i].wv);
+list.Add(&decoder->attentions[i].wbig);
 list.Add(&decoder->attentions[i].wa);
 list.Add(&decoder->fnnLayerNorms[i].w);
 list.Add(&decoder->fnnLayerNorms[i].b);
......
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
 float minmax = 0;
-LoadParamInt(argc, argv, "vsize", &vSize, -1);
+LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
 LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
 LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
 LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
......
@@ -33,6 +33,25 @@ using namespace nts;
 namespace transformer
 {
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
 /* trainer of the T2T model */
 class T2TTrainer
 {
@@ -49,9 +68,15 @@ public:
 /* another buffer */
 int * buf2;
+/* batch buf */
+BatchNode * bufBatch;
 /* buffer size */
 int bufSize;
+/* size of batch buffer */
+int bufBatchSize;
 /* length of each sequence */
 int * seqLen;
@@ -66,6 +91,9 @@ public:
 /* offset for next sequence in the buffer */
 int nextSeq;
+/* offset for next batch */
+int nextBatch;
 /* indicates whether the sequence is sorted by length */
 bool isLenSorted;
@@ -142,8 +170,11 @@ public:
 /* counterpart of "isSmallBatch" */
 bool isBigBatch;
-/* indicates whether we use small memory footprint for backward process */
-bool isSmallFootprint;
+/* randomize batches */
+bool isRandomBatch;
+/* indicates whether we intend to debug the net */
+bool isDebugged;
 public:
 /* constructor */
......
@@ -60,12 +60,13 @@ int TransformerMain(int argc, const char ** argv)
 LoadParamString(argc, args, "output", outputFN, "");
 srand((unsigned int)time(NULL));
 T2TTrainer trainer;
 trainer.Init(argc, args);
 T2TModel model;
 model.InitModel(argc, args);
 /* learn model parameters */
 if(strcmp(trainFN, ""))
 trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
......
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include "XDevice.h"
 #include "XGlobal.h"
 #include "XThread.h"
@@ -59,6 +60,7 @@ XDevice::~XDevice()
 cublasDestroy(cublasHandle);
 if(stream != NULL)
 delete stream;
+curandDestroyGenerator(gen);
 #endif
 }
@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
 Clear();
 devID = myDevID;
+seed = rand();
 /* CPU information */
 if(devID < 0){
@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
 cudaDeviceProp prop;
 cudaSetDevice(myDevID);
+curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+curandSetPseudoRandomGeneratorSeed(gen, seed);
 if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
 XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
 exit(1);
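A plausible consumer of the new per-device generator (the call sites are not part of this diff; curandGenerateUniform is the stock cuRAND routine for filling n device floats with U(0,1) samples):

// Hypothetical sketch only:
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);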
@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
 /* initialize it and get the CPU and GPU information */
 void XDevManager::Init()
 {
+srand((unsigned int)time(NULL));
 Clear();
 /* CPUs (we actually do not care about how many CPUs are using) */
......
@@ -99,6 +99,9 @@ public:
 /* default stream for the device */
 XStream * stream;
+/* seed for random number generation */
+int seed;
 #ifdef USE_CUDA
 /* mutex for handle (GPU cublas) */
@@ -109,6 +112,9 @@ public:
 /* specify if the handle is initialized */
 bool isHandleReady;
+/* generator of random numbers */
+curandGenerator_t gen;
 #endif
......
@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
 #endif
 }
/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
MTYPE used = 0;
MTYPE total = 0;
for(int i = 0; i < blockNum; i++){
if(blocks[i].mem != NULL){
used += blocks[i].used;
total += blocks[i].size;
}
}
fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)used/MILLION, (DTYPE)total/MILLION, (DTYPE)used/total);
}
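Typical use (a sketch; `mem` stands for whatever XMem pool is at hand). Blocks whose mem pointer is NULL are skipped, and sizes are reported in MB via the MILLION constant:

mem->ShowMemUsage(stderr);   // e.g. "used:512.0MB total:1024.0MB usage:0.500"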
 #ifdef USE_CUDA
 /* get the handle of cublas */
......
@@ -24,6 +24,7 @@
 #ifndef __XMEM_H__
 #define __XMEM_H__
+#include <stdio.h>
 #include <stdlib.h>
 #ifdef CUDA_BLAS
@@ -402,6 +403,9 @@ public:
 /* create a new cublas handle */
 void CreateBLASHandle();
+/* show profile of the memory pool */
+void ShowMemUsage(FILE * file);
 #ifdef USE_CUDA
 /* get the handle of cublas */
 cublasHandle_t * GetCublasHandle();
......
@@ -67,6 +67,8 @@ const char * GetOPName(int type)
 return "M_MULTIPLY";
 else if (type == MATH_MULTIPLYDIM)
 return "M_MULTIPLYDIM";
+else if (type == MATH_MULTIPLYBROADCAST)
+return "M_MULTIPLYBROADCAST";
 else if (type == MATH_NEGATE)
 return "M_NEGATE";
 else if (type == MATH_NORMALIZE)
@@ -85,6 +87,8 @@ const char * GetOPName(int type)
 return "M_SUM";
 else if (type == MATH_SUMDIM)
 return "M_SUMDIM";
+else if (type == MATH_SUMBROADCAST)
+return "M_SUMBROADCAST";
 else if (type == REDUCE_REDUCEMAX)
 return "R_REDUCEMAX";
 else if (type == REDUCE_REDUCEMEAN)
......
@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
 #define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
 #define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
-#define MATH_NEGATE MATH_MULTIPLYDIM + 1
+#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
+#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
 #define MATH_NORMALIZE MATH_NEGATE + 1
 #define MATH_POWER MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT MATH_POWER + 1
@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_SUBDIM MATH_SUB + 1
 #define MATH_SUM MATH_SUBDIM + 1
 #define MATH_SUMDIM MATH_SUM + 1
+#define MATH_SUMBROADCAST MATH_SUMDIM + 1
-#define REDUCE MATH_SUMDIM + 1
+#define REDUCE MATH_SUMBROADCAST + 1
 #define REDUCE_REDUCEMAX REDUCE + 1
 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
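Since the operator IDs form one chained sequence, inserting a new ID forces the next constant to be rebased, which is what the REDUCE change above does; GetOPName() earlier in the diff gains a matching branch for each new ID.

// e.g. MATH_SUMDIM = MATH_SUM + 1, MATH_SUMBROADCAST = MATH_SUMDIM + 1,
//      REDUCE = MATH_SUMBROADCAST + 1 (previously MATH_SUMDIM + 1)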
......
@@ -60,6 +60,7 @@
 #include "core/utilities/FlushToMem.cuh"
 #include "core/utilities/SetAscendingOrder.cuh"
 #endif
 /* the nts (NiuTrans.Tensor) namespace */
@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
 void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
 {
 // TODO: cuda code!!!!!!!
@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
 // srand((unsigned)time(0));
 DTYPE variance = upper - lower;
 void * d = NULL;
 if (dataType == X_FLOAT) {
 d = new float[unitNum];
 for (int i = 0; i < unitNum; i++) {
@@ -715,7 +712,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
 d = new double[unitNum];
 for (int i = 0; i < unitNum; i++) {
 *((double*)d + i) = lower + variance * rand() / RAND_MAX;
 }
 }
 else if (dataType == X_FLOAT16) {
 unsigned short random;
@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
 {
 ShowNTErrors("TO DO!");
 }
 }
 /*
......
@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
 "Unmatched tensors in multiplication!");
 CheckNTErrors((a->order == b->order && a->order == c->order),
 "Unmatched tensors!");
-CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-"Unmatched tensors in addition!");
 #ifdef USE_CUDA
 if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......
@@ -22,12 +22,10 @@
 #include "../../XTensor.h"
 #include "../../XDevice.h"
 #include "../../XName.h"
-#include "../CHeader.h"
 #include "MatrixMul.h"
 #include "MatrixMul2D.h"
 #include "XTensorBLAS.h"
 #include "MatrixMulBatched.h"
-#include "timer.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
 >> parallelRunner - parallel processing module
 << return - the result of matrix multiplication
 */
-XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
+XTensor MatrixMul(const XTensor &a, const XTensor &b,
+DTYPE alpha, XPRunner * parallelRunner)
 {
 CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
 CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
 return c;
 }
+}// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
//XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//_ConvertDataType(&a, halfA);
//_ConvertDataType(&b, halfB);
//_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
//_ConvertDataType(halfC, &c);
//DelTensorBuf(halfC);
//DelTensorBuf(halfB);
//DelTensorBuf(halfA);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
/*timer_c timerConvert1;
timerConvert1.m_start_timer();
*/
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
/*timerConvert1.m_end_timer();
printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
timer_c timerMatrixMul;
timerMatrixMul.m_start_timer();*/
_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
/*timerMatrixMul.m_end_timer();
printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
timer_c timerConvert2;
timerConvert2.m_start_timer();
*/
_ConvertDataType(halfC, &c);
/*timerConvert2.m_end_timer();
printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());*/
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
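A hypothetical call site for the wrapper above, assuming a CUDA build and GPU 0: FP32 tensors go in, the multiplication runs in FP16 through the temporary buffers, and an FP32 result comes back.

XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 64, 128, X_FLOAT, 0);    /* FP32 inputs on device 0 */
InitTensor2D(&b, 128, 32, X_FLOAT, 0);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
c = MatrixMulFloat16(a, X_NOTRANS, b, X_NOTRANS);   /* FP16 inside, FP32 out */
c.Dump(stderr, "c:");

Note that the three temporary buffers are released in reverse order of allocation (halfC, halfB, halfA), which appears to follow the LIFO discipline of the NewTensorBuf/DelTensorBuf pool.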
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
//XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//timer_c timerConvert1;
//timerConvert1.m_start_timer();
//_ConvertDataType(&a, halfA);
//_ConvertDataType(&b, halfB);
//timerConvert1.m_end_timer();
//printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
//timer_c timerMatrixMul;
//timerMatrixMul.m_start_timer();
//_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
//timerMatrixMul.m_end_timer();
//printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
//timer_c timerConvert2;
//timerConvert2.m_start_timer();
//_ConvertDataType(halfC, &c);
//timerConvert2.m_end_timer();
//printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());
//DelTensorBuf(halfC);
//DelTensorBuf(halfB);
//DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
}
\ No newline at end of file
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul #define MMul MatrixMul
#define MMul16 MatrixMulFloat16
/* /*
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
...@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor ...@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
XTensor MatrixMul(const XTensor &a, const XTensor &b, XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__ #endif // __MATRIXMUL_H__
\ No newline at end of file
...@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta); _MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner); _MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
/*if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}*/
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
} }
/* a dense matrix multiply a sparse matrix */ /* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) { else if (!a->isSparse && b->isSparse) {
......
...@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL) if (stream != NULL)
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
//if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
// _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
// b->data, transposedB, a->dataType, c->data, c->dataType,
// a->dimSize[0], a->dimSize[1],
// b->dimSize[0], b->dimSize[1],
// c->dimSize[0], c->dimSize[1],
// alpha, beta);
//}
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1], a->dimSize[0], a->dimSize[1],
......
...@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* trans(a) * b */ /* trans(a) * b */
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) { else if (transposedA == X_TRANS && transposedB == X_NOTRANS) {
int num = an; int num = an;
/*if (a->dataType == X_FLOAT16) {
for (int i = 0; i < cn; i++) {
X_FLOAT16 *p3 = (X_FLOAT16*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
X_FLOAT16 r = 0;
X_FLOAT16 * p1 = (X_FLOAT16*)a->data + 0 * am + i;
X_FLOAT16 * p2 = (X_FLOAT16*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}
else {
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
DTYPE r = 0;
DTYPE * p1 = (DTYPE*)a->data + 0 * am + i;
DTYPE * p2 = (DTYPE*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}*/
for (int i = 0; i < cn; i++) { for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm; DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) { for (int j = 0; j < cm; j++) {
......
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XName.h" #include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMulBatched.h" #include "MatrixMulBatched.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
...@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b, ...@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
return c; return c;
} }
/*
matrix multiplication of the two tensors (return an XTensor structure)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensors must have the same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
///*call _MatrixMulBatched function */
//_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMulBatched(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
/*
matrix multiplication of the two tensors (return an XTensor structure)
c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha + ci * beta
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensors must have the same order!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
///*call _MatrixMulBatched function */
//_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMulBatched(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
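For reference, a hypothetical call site of the batched variant above; unlike the non-batched wrapper, both inputs must have the same order, and the batch dimensions of the result are taken from a:

XTensor a;
XTensor b;
XTensor c;
int dimsA[3] = {3, 4, 5};                   /* 3 matrices of size 4 * 5 */
int dimsB[3] = {3, 5, 6};                   /* 3 matrices of size 5 * 6 */
InitTensor(&a, 3, dimsA, X_FLOAT, 1.0F, 0);
InitTensor(&b, 3, dimsB, X_FLOAT, 1.0F, 0);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
c = MatrixMulBatchedFloat16(a, b);          /* c is 3 * 4 * 6, FP32 in and out */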
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#define BMMul MatrixMulBatched #define BMMul MatrixMulBatched
#define BMMul16 MatrixMulBatchedFloat16
/* /*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
...@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta ...@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
XTensor MatrixMulBatched(const XTensor &a, const XTensor &b, XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHED_H__ #endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
...@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i ...@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors in multiplication!"); "Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!"); "Unmatched tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
#ifdef USE_CUDA #ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......
...@@ -22,9 +22,10 @@ ...@@ -22,9 +22,10 @@
#include "Multiply.h" #include "Multiply.h"
#include "MultiplyDim.h" #include "MultiplyDim.h"
#include "MultiplyDim.cuh" #include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -136,29 +137,168 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha) ...@@ -136,29 +137,168 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
tensor multiplication (return an XTensor structure and make tensor connections) tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
c = a * b + \alpha * c c = a * b
where the size of b is equal to the n-th dimension of a, where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting i.e., a is multiplied with b by broadcasting
>> a - a tensor >> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a >> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index >> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication << return - the result tensor by tensor multiplication
*/ */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha) XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
/* call _MultiplyDim function */ /* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha); _MultiplyDim(&a, &b, &c, n, 0);
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, 0);
return c;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of the dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
int order = a->order;
int count = 0;
void * source = 0;
void * target = 0;
for(int i = 0; i < order; i++){
if(a->GetDim(i) == b->GetDim(i))
continue;
if(b->GetDim(i) == 1){
int fitSize = a->GetDim(i);
int j = i + 1;
/* we define a range over dimensions. It is to be unsqueezed */
for(; j < order; j++){
if(a->GetDim(j) == b->GetDim(j))
break;
fitSize *= a->GetDim(j);
}
int dimsS[MAX_TENSOR_DIM_NUM];
int dimsT[MAX_TENSOR_DIM_NUM];
for(int k = 0; k < i; k++){
dimsS[k] = a->GetDim(k);
dimsT[k] = a->GetDim(k);
}
dimsT[i] = fitSize;
bool isLast = true;
for(int k = j; k < order; k++){
dimsS[i + k - j + 0] = b->GetDim(k);
dimsT[i + k - j + 1] = b->GetDim(k);
if(a->GetDim(k) != b->GetDim(k)){
if(b->GetDim(k) == 1)
isLast = false;
else{
ShowNTErrors("Wrong dimension size!")
}
}
}
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
else{
source = target;
}
target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
s->data = source;
t->data = target;
_Unsqueeze(s, t, i, fitSize);
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, size);
else
XMemFree(t->devID, source);
}
/* we do multiplication here */
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
else
XMemFree(t->devID, target);
target = NULL;
}
s->data = NULL;
t->data = NULL;
DelTensor(s);
DelTensor(t);
i = j;
count++;
}
}
if(count == 0)
_Multiply(a, b, c, beta);
CheckNTErrors(target == NULL, "Something is wrong!");
}
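In the simplest 2-D case the unsqueeze-then-multiply scheme above collapses to plain index arithmetic; a minimal standalone sketch, assuming b has size 1 along dimension 0:

#include <cstdio>

int main()
{
    const int rows = 2, cols = 3;
    float a[rows][cols] = {{1, 2, 3}, {4, 5, 6}};
    float b[1][cols]    = {{10, 20, 30}};   /* dimension 0 has size 1 */
    float c[rows][cols];

    /* conceptually: unsqueeze b from 1 * cols to rows * cols, then multiply elementwise */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            c[i][j] = a[i][j] * b[0][j];    /* b is read with its size-1 index fixed at 0 */

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++)
            printf("%g ", c[i][j]);
        printf("\n");
    }
    return 0;
}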
/*
tensor broadcast multiplication
c = a * b
where some of the dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _MultiplyBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
return c; return c;
} }
......
...@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
} }
} }
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
if (stride > 1) { if (stride > 1) {
...@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
} }
} }
#endif else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup); BacktoCudaDev(a->devID, devIDBackup);
} }
......
...@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP ...@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */ i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0); void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a, /* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0); XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor broadcast multiplication c = a * b + c * \beta where some of the dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast multiplication c = a * b where some of the dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta) ...@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
#endif #endif
} }
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
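Note that the kernel truncates beta through the (int)beta cast, so fractional scaling factors are floored toward zero. A host-side reference with the same semantics (SumIntHost is a hypothetical helper; float stands in for DTYPE):

/* host-side reference for KernelADDInt; note the same (int)beta truncation */
void SumIntHost(const int * a, const int * b, int * c, int size, float beta)
{
    int ibeta = (int)beta;                  /* e.g. beta = 1.5F becomes 1 */
    for (int i = 0; i < size; i++)
        c[i] = a[i] + b[i] * ibeta;
}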
/* /*
tensor summation c = a + b * \beta (cuda version) tensor summation c = a + b * \beta (cuda version)
...@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
if ((c == a && handle != NULL) && *handle != 0) { if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION #ifdef DOUBELPRICSION
cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)->data, 1, (DTYPE*)a->data, 1); cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#else #else
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1); cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif #endif
...@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
b->dataType == X_FLOAT16 && b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16) c->dataType == X_FLOAT16)
{ {
cublasHandle_t * handle = NULL;
if ((a->mem != NULL) && (b->mem != NULL)) {
cublasHandle_t * handleA = a->mem->GetCublasHandle();
cublasHandle_t * handleB = b->mem->GetCublasHandle();
handle = *handleA != 0 ? handleA : handleB;
}
else {
handle = GDevs.GetCudaHandle(a->devID);
}
int gridSize[3], blockSize[3]; int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
...@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta); KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
} }
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else { else {
// TODO!! // TODO!!
......
...@@ -17,12 +17,16 @@ ...@@ -17,12 +17,16 @@
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/ */
#include "Sum.h" #include "Sum.h"
#include "SumDim.h" #include "SumDim.h"
#include "SumDim.cuh" #include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h" #include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
/* call _Sum function */ /* call _SumDim function */
_SumDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, beta);
/* tensor connections */ /* tensor connections */
...@@ -162,5 +166,146 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -162,5 +166,146 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
int order = a->order;
int count = 0;
void * source = 0;
void * target = 0;
for(int i = 0; i < order; i++){
if(a->GetDim(i) == b->GetDim(i))
continue;
if(b->GetDim(i) == 1){
int fitSize = a->GetDim(i);
int j = i + 1;
/* we define a range over dimensions. It is to be unsqueezed */
for(; j < order; j++){
if(a->GetDim(j) == b->GetDim(j))
break;
fitSize *= a->GetDim(j);
}
int dimsS[MAX_TENSOR_DIM_NUM];
int dimsT[MAX_TENSOR_DIM_NUM];
for(int k = 0; k < i; k++){
dimsS[k] = a->GetDim(k);
dimsT[k] = a->GetDim(k);
}
dimsT[i] = fitSize;
bool isLast = true;
for(int k = j; k < order; k++){
dimsS[i + k - j + 0] = b->GetDim(k);
dimsT[i + k - j + 1] = b->GetDim(k);
if(a->GetDim(k) != b->GetDim(k)){
if(b->GetDim(k) == 1)
isLast = false;
else{
ShowNTErrors("Wrong dimension size!")
}
}
}
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
else{
source = target;
}
target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
s->data = source;
t->data = target;
_Unsqueeze(s, t, i, fitSize);
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, size);
else
XMemFree(t->devID, source);
}
/* we do summation here */
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
else
XMemFree(t->devID, target);
target = NULL;
}
s->data = NULL;
t->data = NULL;
DelTensor(s);
DelTensor(t);
i = j;
count++;
}
}
if(count == 0)
_Sum(a, b, c, beta);
CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1
we return c here
>> a - a tensor
>> b - another tensor that would be broadcasted
>> beta - the scaling factor
<< return - the resulting tensor c
*/
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
return c;
}
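A hypothetical call site, assuming a CPU build (devID = -1); b's size-1 dimension is expanded to match a before the summation:

XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 4, 8, X_FLOAT, -1);
InitTensor2D(&b, 1, 8, X_FLOAT, -1);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
c = SumBroadcast(a, b);        /* default beta = 1.0: b is broadcast along dimension 0 */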
} }
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/ */
#ifndef __SUMDIM_CUH__ #ifndef __SUMDIM_CUH__
......
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* It reached 39 degrees centigrade around 3:00 pm in Shenyang * It reached 39 degrees centigrade around 3:00 pm in Shenyang
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* Four of my master students graduated. Good luck to them for their future work!
*/ */
#ifndef __SUMDIM_H__ #ifndef __SUMDIM_H__
...@@ -38,6 +41,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -38,6 +41,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, /* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -23,89 +23,12 @@ ...@@ -23,89 +23,12 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h" #include "../../XTensor.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
#include <stdint.h>
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
//typedef char __int8;
half uint16_as_fp16(uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
uint32_t fp32_as_uint32(float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn(float a)
{
uint32_t ia = fp32_as_uint32(a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
}
else {
ir = 0x7fff; /* canonical NaN */
}
}
else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
}
else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
}
else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16(ir);
}
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
...@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT); cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
} }
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) { else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
//ShowNTErrors("TO DO!");
int alpha2 = (int)alpha; int alpha2 = (int)alpha;
int beta2 = (int)beta; int beta2 = (int)beta;
/*
CUDA requires that the leading dimensions of the two tensors (lda, ldb) be multiples of 4.
See https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/ for details.
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT); cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
} }
else { else {
ShowNTErrors("Unsupported data type!"); ShowNTErrors("Unsupported data type!");
......
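Given the multiples-of-4 restriction on lda/ldb noted above, a caller may have to pad matrix dimensions before taking the int8 path; PadLeadingDimTo4 is a hypothetical helper, not something cuBLAS provides:

/* round a leading dimension up to the next multiple of 4,
   e.g. PadLeadingDimTo4(10) == 12, PadLeadingDimTo4(8) == 8 */
inline int PadLeadingDimTo4(int n)
{
    return (n + 3) & ~3;
}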
...@@ -26,8 +26,6 @@ ...@@ -26,8 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
half float2half_rn(float a);
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
......
...@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
//delete t2; //delete t2;
} }
} }
/*
generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
//CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if (tensor->devID < 0) {
_SetDataRand(tensor, lower, upper);
DTYPE * data = (DTYPE*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) {
if (data[i] >= p)
data[i] = value;
else
data[i] = 0;
}
}
else {
#ifdef USE_CUDA
_CudaSetDataRandP(tensor, lower, upper, p, value);
#else
ShowNTErrors("Please recompile the code by specifying USE_CUDA");
#endif // USE_CUDA
}
}
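A minimal standalone sketch of the CPU branch above, assuming a plain float array in place of an XTensor:

#include <cstdio>
#include <cstdlib>

/* draw from [lower, upper], then threshold: value if the draw >= p, 0 otherwise */
void SetDataRandPSketch(float * data, int n, float lower, float upper, float p, float value)
{
    for (int i = 0; i < n; i++) {
        float r = lower + (upper - lower) * (float)rand() / (float)RAND_MAX;
        data[i] = (r >= p) ? value : 0.0F;
    }
}

int main()
{
    float d[8];
    SetDataRandPSketch(d, 8, 0.0F, 1.0F, 0.3F, 2.0F);   /* roughly 70% of items become 2, the rest 0 */
    for (int i = 0; i < 8; i++)
        printf("%g ", d[i]);
    printf("\n");
    return 0;
}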
/* /*
generate data items with a normal distribution with specified mean and standard deviation generate data items with a normal distribution with specified mean and standard deviation
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../getandset/ConvertDataType.h" #include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high] ...@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
__global__ __global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance) void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{ {
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower); half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance); half varianceHalf = __float2half(variance);
...@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance) ...@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
if (i < size) { if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf; d[i] = d[i] * varianceHalf + lowerHalf;
} }
#endif
}
/*
set each data item to a pre-defined value if it is >= p, and to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value we intend to assign to the item
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
} }
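The same thresholding logic, as a self-contained CUDA program one can compile with nvcc to sanity-check the kernels above; PCut mirrors KernelSetDataPCut for float data:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void PCut(float * d, int size, float p, float value)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        d[i] = (d[i] >= p) ? value : 0.0F;
}

int main()
{
    const int n = 8;
    float h[n] = {0.1F, 0.9F, 0.5F, 0.7F, 0.2F, 0.95F, 0.3F, 0.6F};
    float * d = NULL;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);
    PCut<<<1, 256>>>(d, n, 0.5F, 2.0F);     /* one block is enough for 8 items */
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%g ", h[i]);                /* expected: 0 2 2 2 0 2 0 2 */
    printf("\n");
    cudaFree(d);
    return 0;
}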
/* /*
...@@ -473,34 +516,81 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -473,34 +516,81 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen; XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT) { if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum); curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
} }
else{
else if (tensor->dataType == X_DOUBLE) { curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum); curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
} }
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
else if (tensor->dataType == X_FLOAT16) { DTYPE variance = upper - lower;
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID,tensor->mem);
curandGenerateUniform(gen, (float *)tensor1.data, tensor1.unitNum); if (variance != 1.0F || lower != 0) {
curandDestroyGenerator(gen); if (tensor->dataType == X_FLOAT) {
_ConvertDataType(&tensor1, tensor); KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance); }
} else if (tensor->dataType == X_DOUBLE) {
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
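The reworked path above always draws uniform [0, 1) numbers with curand and only rescales when [lower, upper) differs from [0, 1); a host analogue of that two-step scheme, using std::mt19937 in place of curand:

#include <random>
#include <vector>

std::vector<float> UniformRangeSketch(int n, float lower, float upper)
{
    std::mt19937 gen(42);
    std::uniform_real_distribution<float> u01(0.0F, 1.0F);
    std::vector<float> v(n);
    float variance = upper - lower;        /* same naming as in the kernels */
    for (float & x : v)
        x = u01(gen) * variance + lower;   /* what KernelSetDataRandFloat applies on device */
    return v;
}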
/*
generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
_CudaSetDataRand(tensor, lower, upper);
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) {
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/* /*
set the data with an array of offsets (kernel version) set the data with an array of offsets (kernel version)
>> data - pointer to the data array >> data - pointer to the data array
......
...@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift); ...@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */ /* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num); void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
......
...@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift); ...@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */ /* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F); void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
...@@ -35,8 +35,6 @@ get the power(a, p) ...@@ -35,8 +35,6 @@ get the power(a, p)
*/ */
void _Power(const XTensor * a, XTensor * b, DTYPE p) void _Power(const XTensor * a, XTensor * b, DTYPE p)
{ {
CheckNTErrors(a->dataType == b->dataType, "Unmatched tensors in addition!");
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
......
...@@ -138,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -138,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -270,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -270,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI ...@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XName.h" #include "../../XName.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "../core/arithmetic/MultiplyDim.h" #include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
#include "../core/CHeader.h" #include "../core/CHeader.h"
#include "../core/getandset/SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor namespace nts{ // namespace nts(NiuTrans.Tensor
...@@ -40,7 +41,7 @@ for more details. ...@@ -40,7 +41,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data. the same inference procedure as that on the test data with no use of dropout.
>> x - input tensor >> x - input tensor
>> y - output tensor >> y - output tensor
...@@ -123,8 +124,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x, ...@@ -123,8 +124,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
else else
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
/* /*
dropout function (we make tensor connections here) dropout function (we make tensor connections here)
It randomly zeroes some of the elements of the input tensor It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution. with probability p via a Bernoulli distribution.
...@@ -135,89 +136,108 @@ for more details. ...@@ -135,89 +136,108 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data. the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor >> x - input tensor
>> dropProb - probability to set an element to zero >> dropProb - probability to set an element to zero
>> leadingDim - the dimension which we generate the random numbers and perform broadcasting >> leadingDim - the dimension which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/ */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim) XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{ {
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!"); CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
int n = leadingDim < 0 ? x.order - 1 : leadingDim; XTensor mask;
DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!"); if(leadingDim < 0 && leadingDim2 < 0){
XTensor mask;
InitTensor(&mask, &x);
_SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
return Multiply(x, mask);
}
else if(leadingDim2 < 0){
int n = leadingDim;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb); CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
/* generate a mask tensor with probability p */ XTensor mask;
int unitNum = x.dimSize[n]; InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
DTYPE * maskArray = new DTYPE[unitNum]; mask.SetData(maskArray, unitNum);
//srand((unsigned int)time(NULL)); delete[] maskArray;
for (int i = 0; i < unitNum; i++) {
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
}
XTensor mask; if (x.dataType == X_FLOAT)
InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem); {
mask.SetData(maskArray, unitNum); return MultiplyDim(x, mask, n);
}
else if (x.dataType == X_FLOAT16)
{
XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType(&mask, &mask1);
return MultiplyDim(x, mask1, n);
}
else {
ShowNTErrors("TODO!");
}
}
else{
int n = leadingDim;
int m = leadingDim2;
delete[] maskArray; CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n] * x.dimSize[m];
maskArray = new DTYPE[unitNum];
if (x.dataType == X_FLOAT) //srand((unsigned int)time(NULL));
{ for (int i = 0; i < unitNum; i++)
return MultiplyDim(x, mask, n, 0); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
}
else
{
XTensor mask1;
mask1 = ConvertDataType(mask, X_FLOAT16);
return MultiplyDim(x, mask1, n, 0); int dims[MAX_TENSOR_DIM_NUM];
}
}
for(int i = 0; i < x.order; i++)
dims[i] = 1;
dims[n] = x.GetDim(n);
dims[m] = x.GetDim(m);
InitTensor(&mask, x.order, dims, X_FLOAT, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
if (x.dataType == X_FLOAT)
{
return MultiplyBroadcast(x, mask);
}
else if (x.dataType == X_FLOAT16)
{
XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType(&mask, &mask1);
return MultiplyBroadcast(x, mask1);
}
else {
ShowNTErrors("TODO!");
}
}
//XTensor DropoutFloat16(const XTensor &x, DTYPE dropProb, int leadingDim)
//{ }
// CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
//
// int n = leadingDim < 0 ? x.order - 1 : leadingDim;
//
// CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
//
// DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
//
// /* generate a mask tensor with probability p */
// int unitNum = x.dimSize[n];
// DTYPE * maskArray = new DTYPE[unitNum];
//
// //srand((unsigned int)time(NULL));
// for (int i = 0; i < unitNum; i++) {
// maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
// }
//
// XTensor mask;
// InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
// mask.SetData(maskArray, unitNum);
//
// delete[] maskArray;
//
// XTensor halfMask;
// halfMask = ConvertDataType(mask, X_FLOAT16);
// XTensor halfX;
// halfX = ConvertDataType(x, X_FLOAT16);
// XTensor result;
// XTensor halfResult;
//
// halfResult = MultiplyDim(halfX, halfMask, n, 0);
//
// result = ConvertDataType(halfResult, X_FLOAT);
// return result;
// /*return MultiplyDim(x, mask1, n, 0);*/
//}
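A hypothetical call site covering the three masking modes of the updated interface, assuming x is an initialized FP32 tensor:

XTensor y1;
XTensor y2;
XTensor y3;
y1 = Dropout(x, 0.1F);          /* full-size mask, generated on device via _SetDataRandP */
y2 = Dropout(x, 0.1F, 0);       /* 1-D mask, broadcast along dimension 0 via MultiplyDim */
y3 = Dropout(x, 0.1F, 0, 1);    /* 2-D mask over dimensions 0 and 1 via MultiplyBroadcast */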
/* /*
dropout function without broadcast dropout function without broadcast
...@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb) ...@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int unitNum = x.unitNum; int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum]; DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor); maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......
...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value) inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
{ {
return (DTYPE)rand()/(DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0; return (DTYPE)rand()/(DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
} }
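The 1/(1-p) factor makes the mask unbiased, i.e. its expectation is 1, so activations keep their scale during training; a standalone check (the default rand() seed makes the run deterministic):

#include <cstdio>
#include <cstdlib>

int main()
{
    const float p = 0.3F;
    const float scale = 1.0F / (1.0F - p);  /* same scaling as RandomBernoulli's value argument */
    const int n = 1000000;
    double sum = 0.0;
    for (int i = 0; i < n; i++)
        sum += ((float)rand() / (float)RAND_MAX >= p) ? scale : 0.0F;
    printf("mean mask value = %f (expected about 1.0)\n", sum / n);
    return 0;
}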
/* dropout function */
...@@ -40,11 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
void _DropoutBackward(const XTensor * y, const XTensor * x,
                      const XTensor * dedy, XTensor * dedx,
                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
...
...@@ -27,9 +27,6 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../../tensor/core/getandset/ConvertDataType.h"
using namespace nts;
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
    delete[] dimSize;
// if (!x->isSparse && !y->isSparse &&
// x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
// {
// int * dimSize = new int[x->order - 1];
// for (int i = 0; i < x->order; i++) {
// if (i < leadDim)
// dimSize[i] = -x->dimSize[i];
// else if (i > leadDim)
// dimSize[i - 1] = -x->dimSize[i];
// }
//
// XMem * mem = x->mem;
// XTensor * max = NULL;
// XTensor * sum = NULL;
// XTensor * blockx = NULL;
// XTensor * blocky = NULL;
// XTensor * blockMax = NULL;
// XTensor * blockSum = NULL;
//
// int dimensionSize = y->dimSizeRDI[leadDimRDI];
// int stride = 1;
// int blockSize = 1;
// int blockNum = 1;
//
// for (int i = 0; i < leadDimRDI; i++)
// stride *= y->dimSizeRDI[i];
// blockSize = stride * dimensionSize;
// blockNum = y->unitNum / blockSize;
//
// max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
// sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
//
// _ReduceMax(x, max, leadDim);
// _ReduceSum(x, sum, leadDim, max, 1.0F, true);
//
// if (x->devID >= 0) {
// if (leadDimRDI == 0) {
// blockSize = y->unitNum;
// blockNum = 1;
// blockx = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// }
// else {
// blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// }
// }
//
// for (int k = 0; k < blockNum; k++) {
// int m = stride;
// int n = dimensionSize;
//
// DTYPE * ip = (DTYPE*)x->data + k * blockSize;
// DTYPE * op = (DTYPE*)y->data + k * blockSize;
// DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
// DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
//
// if (x->devID < 0) {
// for (int j = 0; j < m; j++) {
// DTYPE sumValue = sp[j];
// if (sumValue == 0) {
// for (int i = 0; i < n; i++)
// op[i * m + j] = 0;
// }
// else {
// for (int i = 0; i < n; i++) {
// DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
// if (IsNAN(r))
// r = LOGPROB_MIN;
// if (IsINF(r))
// r = LOGPROB_MIN;
//
// op[i * m + j] = MAX(r, LOGPROB_MIN);
// }
// }
// }
// }
// else {
// blockx->data = ip;
// blocky->data = op;
// blockMax->data = mp;
// blockSum->data = sp;
//#ifdef USE_CUDA
// if (leadDimRDI == 0)
// _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
// else
// _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
//#else
// ShowNTErrors("Please specify USE_CUDA and recompile the code!");
//#endif
// blockx->data = NULL;
// blocky->data = NULL;
// blockMax->data = NULL;
// blockSum->data = NULL;
// }
// }
//
// DelTensorBuf(max);
// DelTensorBuf(sum);
//
// if (x->devID >= 0) {
// delete blockx;
// delete blocky;
// delete blockMax;
// delete blockSum;
// }
//
// delete[] dimSize;
// }
// else
// ShowNTErrors("TODO!");
}
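The removed block above computed log-softmax in the numerically stable form y = log(exp(x - max) / sum), clamping NaN/Inf results to LOGPROB_MIN. A minimal CPU sketch of the same formula, independent of the XTensor machinery (the floor value here is an illustrative stand-in for the library's LOGPROB_MIN constant):

#include <cmath>
#include <cstdio>

/* illustrative floor for log-probabilities, mirroring LOGPROB_MIN */
const float kLogProbMin = -1.0e20F;

/* stable log-softmax over a single vector:
   y[i] = log(exp(x[i] - max) / sum_j exp(x[j] - max)) */
void LogSoftmax1D(const float * x, float * y, int n)
{
    float mx = x[0];
    for (int i = 1; i < n; i++)
        if (x[i] > mx) mx = x[i];

    float sum = 0.0F;
    for (int i = 0; i < n; i++)
        sum += expf(x[i] - mx);

    for (int i = 0; i < n; i++) {
        float r = logf(expf(x[i] - mx) / sum);
        /* clamp NaN/Inf results, as the removed block did */
        if (std::isnan(r) || std::isinf(r))
            r = kLogProbMin;
        y[i] = r > kLogProbMin ? r : kLogProbMin;
    }
}

int main()
{
    float x[4] = { 1.0F, 2.0F, 3.0F, 4.0F };
    float y[4];
    LogSoftmax1D(x, y, 4);
    for (int i = 0; i < 4; i++)
        printf("%f\n", y[i]);
    return 0;
}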
/*
...
...@@ -26,7 +26,6 @@
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...@@ -26,7 +26,6 @@
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...
...@@ -29,7 +29,6 @@
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
#include "../XUtility.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
    /* synchronize to make sure the values of max and sum are loaded */
    __syncthreads();
    //printf("1: %d %d %d %d\n", i, strideSizeTotal, j, strideNum);
    if(i < strideSizeTotal && j < strideNum){
        int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
        DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
...@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
    /* synchronize to make sure the values of max and sum are loaded */
    __syncthreads();
    //printf("2: %d %d %d %d\n",i ,stride * blockNum ,j ,strideNum);
    if(i < stride * blockNum && j < strideNum){
        int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
...@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
        if (leadDim != 0 || dimensionSize <= 10) {
            //printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
            KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                          ((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
                                           stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
...@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
    }
    else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
        //printf("%d\n\n",dimensionSize);
        //printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
        KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
                                          ((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
                                           stride, dimensionSize, blockNum);
...@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
        ShowNTErrors("TODO!");
    }
    /*XTensor y1;
    y1 = ConvertDataType(*y, X_FLOAT);
    y1.Dump(stderr, "y1:");*/
    BacktoCudaDev(x->devID, devIDBackup);
}
...
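KernelSoftmaxComputeTensorHalf relies on native __half arithmetic, which is why its body sits behind #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__): half-precision math instructions only exist on compute capability 5.3 and later. A standalone sketch of the same guard pattern (the kernel name and the float fallback are illustrative assumptions, not part of this commit):

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdio>

/* compute y = exp(x - m) elementwise in half precision where the
   hardware supports it; fall back to float math elsewhere */
__global__ void ExpShiftHalf(__half * x, __half * y, __half m, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n)
        return;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    /* native __half arithmetic (requires compute capability >= 5.3) */
    y[i] = hexp(__hsub(x[i], m));
#else
    /* fallback: promote to float, compute, convert back */
    y[i] = __float2half(expf(__half2float(x[i]) - __half2float(m)));
#endif
}

int main()
{
    const int n = 8;
    __half hx[n], hy[n];
    for (int i = 0; i < n; i++)
        hx[i] = __float2half((float)i);

    __half *dx, *dy;
    cudaMalloc(&dx, n * sizeof(__half));
    cudaMalloc(&dy, n * sizeof(__half));
    cudaMemcpy(dx, hx, n * sizeof(__half), cudaMemcpyHostToDevice);

    ExpShiftHalf<<<1, n>>>(dx, dy, __float2half(7.0F), n);
    cudaMemcpy(hy, dy, n * sizeof(__half), cudaMemcpyDeviceToHost);

    for (int i = 0; i < n; i++)
        printf("%f\n", __half2float(hy[i]));

    cudaFree(dx);
    cudaFree(dy);
    return 0;
}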