Commit 44bf9fa6 by linye

Merged the latest version of the code and fixed some bugs.

parent c9c53870
......@@ -66,6 +66,7 @@ void PowerFP16Test();
void ClipFP16Test();
void GatherFP16Test();
void SetDataGPUFP16Test();
void SumIntTest();
using namespace nts;
using namespace fnnlm;
......@@ -89,8 +90,6 @@ int main( int argc, const char ** argv )
//return 0;
//ConvertBackwardTest();
//return 0;
//DropoutFP16Test();
//return 0;
//UnsqueezeFP16Test();
//return 0;
//ReduceMaxFP16Test();
......@@ -143,11 +142,17 @@ int main( int argc, const char ** argv )
//InitCPUFP16Test();
//return 0;
SetDataGPUFP16Test();
return 0;
//MycublasGemmExTest();
//return 0;
MycublasGemmExTest();
return 0;
//SumIntTest();
//return 0;
//DropoutFP16Test();
//return 0;
//SetDataGPUFP16Test();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......@@ -205,27 +210,69 @@ void MycublasGemmExTest() {
c1.Dump(stderr, "c1:");
}
void SumIntTest() {
XTensor a;
XTensor b;
XTensor c;
XTensor inta;
XTensor intb;
XTensor intc;
InitTensor2D(&a, 2, 2, X_FLOAT, 0);
InitTensor2D(&b, 2, 2, X_FLOAT, 0);
a.SetDataRand(-5.0, 5.0);
b.SetDataRand(-5.0, 5.0);
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
inta = ConvertDataType(a, X_INT);
intb = ConvertDataType(b, X_INT);
inta.Dump(stderr, "inta:");
intb.Dump(stderr, "intb:");
intc = Sum(inta, intb);
intc.Dump(stderr, "intc:");
}
void SetDataGPUFP16Test() {
srand(time(NULL));
XTensor a1;
InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
_SetDataRand(&a1, -5.0F, 5.0F);
a1.Dump(&a1, stderr, "a:\n");
/*XTensor m;
InitTensor2D(&m, 2, 2, X_FLOAT, 0);
m.SetDataRand(0.0, 10.0);*/
XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT16, 0);
XTensor * m = NewTensor2D(2, 2, X_FLOAT, 0);
m->SetDataRand(0.0, 10.0);
_SetDataRand(&a, -5.0F, 5.0F);
//XTensor a1;
//InitTensor2D(&a1, 2, 2, X_FLOAT, 0);
//_CopyValues(&m, &a1);
//_SetDataRand(&a1, -1.0F, 1.0F);
//a1.Dump(&a1, stderr, "a:\n");
a.Dump(&a, stderr, "a:\n");
/*XTensor a;
InitTensor2D(&a, 2, 2, X_FLOAT16, 0);*/
XTensor b;
InitTensor2D(&b, 2, 2, X_FLOAT, 0);
XTensor * a = NewTensor2D(2, 2, X_FLOAT16, 0);
b = ConvertDataType(a, X_FLOAT);
_ConvertDataType(m, a);
a->Dump(a, stderr, "a:\n");
_SetDataRand(a, 0.0F, 1.0F);
a->Dump(a, stderr, "a:\n");
//XTensor b;
//InitTensor2D(&b, 2, 2, X_FLOAT, 0);
//b = ConvertDataType(a, X_FLOAT);
//b.Dump(stderr, "b:\n");
b.Dump(stderr, "b:\n");
}
void ClipFP16Test() {
......@@ -447,7 +494,7 @@ void FloatToInt8Test() {
InitTensor2D(&a, 2, 2, X_FLOAT, 0);
InitTensor2D(&b, 2, 2, X_INT8, 0);
a.SetDataRand(-5.0F, 5.0F);
a.SetDataRand(5.0F, 5.0F);
a.Dump(stderr, "a:");
b = ConvertDataType(a, X_INT8);
......@@ -741,8 +788,8 @@ void MultiplyDimFP16Test()
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
c1 = MultiplyDim(a1, b1, 1, 0);
halfC = MultiplyDim(halfA, halfB, 1, 0);
c1 = MultiplyDim(a1, b1, 1);
halfC = MultiplyDim(halfA, halfB, 1);
c = ConvertDataType(halfC, X_FLOAT);
......@@ -950,26 +997,26 @@ void SubFP16Test()
void DropoutFP16Test()
{
srand(time(NULL));
XTensor a;
XTensor b;
XTensor b1;
XTensor halfA;
XTensor halfB;
InitTensor2D(&a, 10, 10, X_FLOAT, 0);
InitTensor2D(&a, 10, 1, X_FLOAT, 0);
a.SetDataRand(-5.0F, 5.0F);
/*a.Dump(stderr, "a:");*/
a.Dump(stderr, "a:");
halfA = ConvertDataType(a, X_FLOAT16);
halfB = Dropout(halfA, 0.5);
b1 = Dropout(a, 0.3);
b = ConvertDataType(halfB, X_FLOAT);
halfB = Dropout(halfA, 0.2);
b1 = Dropout(a, 0.2);
b.Dump(stderr, "b:");
//b1.Dump(stderr, "b1:");
halfB.Dump(&halfB, stderr, "halfB:");
b1.Dump(&b1, stderr, "b1:");
}
void ConvertBackwardTest()
......@@ -1069,132 +1116,4 @@ void ConvertTest()
a1.Dump(stderr, "halfa:");
}
}
void MatrixMulFloat16AndFloatTest()
{
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 5000, 5000, X_FLOAT, 0);
InitTensor2D(&b, 5000, 5000, X_FLOAT, 0);
InitTensor2D(&c, 5000, 5000, X_FLOAT, 0);
a.SetDataRand(-10.0F, 10.0F);
b.SetDataRand(-10.0F, 10.0F);
int recurrentNum = 10000;
double startT1 = GetClockSec();
for (int i1 = 0; i1 < recurrentNum; i1++)
{
c= MatrixMul(&a, &b);
}
printf("ElapsedFloat32 = %.2f s \n", GetClockSec() - startT1);
double startT2 = GetClockSec();
for (int i2 = 0; i2 < recurrentNum; i2++)
{
c = MatrixMulFloat16(&a, &b);
}
printf("ElapsedFloat16 = %.2f s \n", GetClockSec() - startT2);
}
void MatrixMul2DFloat16Test()
{
XTensor a;
XTensor b;
XTensor c;
XTensor a00;
XTensor b00;
XTensor c00;
XTensor c01;
XTensor halfa;
XTensor halfb;
XTensor halfc;
InitTensor3D(&a, 3, 2, 3, X_FLOAT, 0);
InitTensor2D(&b, 3, 2, X_FLOAT, 0);
InitTensor3D(&c, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&a00, 3, 2, 3, X_FLOAT, 0);
InitTensor2D(&b00, 3, 2, X_FLOAT, 0);
InitTensor3D(&c00, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&c01, 3, 2, 2, X_FLOAT, 0);
InitTensor3D(&halfa, 3, 2, 3, X_FLOAT16, 0);
InitTensor2D(&halfb, 3, 2, X_FLOAT16, 0);
InitTensor3D(&halfc, 3, 2, 2, X_FLOAT16, 0);
DTYPE aData[3][2][3] = { { { 0.02121212144F, -1.0234556667F, 2.04354565678F },
{ 2.0234567332F, -1.0213469654F, -3.01568321F } },
{ { -1.022347899421F, 2.012589653664F, 4.035346643F },
{ 3.01234544634F, 1.0324354635F, 2.0546578332F } },
{ { -1.0235743446F, 3.0335753334F, 2.0653323234F },
{ 1.03235643232F, -1.023463345542F, 0.0335563322F } } };
DTYPE bData[3][2] = { { -1.034466323232F, -2.0546676442F },
{ -3.0224354656F, 4.034467866532F },
{ 5.02354657442F, -6.0324355767443F } };
a.SetData(aData, 18);
b.SetData(bData, 6);
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);
_ConvertDataType(&a, &halfa);
_ConvertDataType(&b, &halfb);
_MatrixMul(&halfa, X_NOTRANS, &halfb, X_NOTRANS, &halfc);
_ConvertDataType(&halfc, &c01);
_ConvertDataType(&halfa, &a00);
_ConvertDataType(&halfb, &b00);
_MatrixMul(&a00, X_NOTRANS, &b00, X_NOTRANS, &c00);
c.Dump(stderr, "c:");
c01.Dump(stderr, "c01:");
c00.Dump(stderr, "c0:");
XTensor a1;
XTensor b1;
XTensor c1;
XTensor a10;
XTensor b10;
XTensor c10;
XTensor c11;
XTensor halfa1;
XTensor halfb1;
XTensor halfc1;
InitTensor2D(&a1, 3, 3, X_FLOAT, 0);
InitTensor2D(&b1, 3, 2, X_FLOAT, 0);
InitTensor2D(&c1, 3, 2, X_FLOAT, 0);
InitTensor2D(&a10, 3, 3, X_FLOAT, 0);
InitTensor2D(&b10, 3, 2, X_FLOAT, 0);
InitTensor2D(&c10, 3, 2, X_FLOAT, 0);
InitTensor2D(&c11, 3, 2, X_FLOAT, 0);
InitTensor2D(&halfa1, 3, 3, X_FLOAT16, 0);
InitTensor2D(&halfb1, 3, 2, X_FLOAT16, 0);
InitTensor2D(&halfc1, 3, 2, X_FLOAT16, 0);
DTYPE a1Data[3][3] = { { 0.02121212144F, -1.0234556667F, 2.043541565678F },
{ -2.0234567332F, 1.0213469657774F, -3.0156837543321F } ,
{ 1.022347899421F, -2.012589653664F, 4.03534634643F }};
DTYPE b1Data[3][2] = { { 1.034466323232F, -2.0546676442F },
{ 3.0224354656F, -4.034467866532F },
{ 5.02354657442F, 6.0324355767443F } };
a1.SetData(a1Data, 9);
b1.SetData(b1Data, 6);
_MatrixMul(&a1, X_NOTRANS, &b1, X_NOTRANS, &c1);
_ConvertDataType(&a1, &halfa1);
_ConvertDataType(&b1, &halfb1);
_MatrixMul(&halfa1, X_NOTRANS, &halfb1, X_NOTRANS, &halfc1);
_ConvertDataType(&halfc1, &c11);
_ConvertDataType(&halfa1, &a10);
_ConvertDataType(&halfb1, &b10);
_MatrixMul(&a10, X_NOTRANS, &b10, X_NOTRANS, &c10);
c1.Dump(stderr, "c1:");
c11.Dump(stderr, "c11:");
c10.Dump(stderr, "c10:");
}
}
\ No newline at end of file
......@@ -87,8 +87,6 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
//return;
if(gold == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
......@@ -97,7 +95,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
//ShowNTErrors("TODO");
ShowNTErrors("TODO");
}
return;
}
......
......@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradMultiply(node, isEfficient);
else if(operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node, isEfficient);
else if (operID == MATH_MULTIPLYBROADCAST)
GradMultiplyBroadcast(node, isEfficient);
else if(operID == MATH_NEGATE)
GradNegate(node, isEfficient);
else if(operID == MATH_NORMALIZE)
......@@ -87,6 +89,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM)
GradSumDim(node, isEfficient);
else if(operID == MATH_SUMBROADCAST)
GradSumBroadcast(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM)
......@@ -736,10 +740,6 @@ dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
{
/* XTensor node1;
node1 = ConvertDataType(*node, X_FLOAT);
node1.Dump(stderr, "node:");*/
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
......@@ -751,14 +751,6 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
//XTensor a1;
//a1 = ConvertDataType(*a, X_FLOAT);
//a1.Dump(stderr, "a:");
//XTensor b1;
//b1 = ConvertDataType(*b, X_FLOAT);
//b1.Dump(stderr, "b:");
/* dE/db */
int order = a->order;
......@@ -777,19 +769,10 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
/*XTensor bGradTMP1;
bGradTMP1 = ConvertDataType(*bGradTMP, X_FLOAT);
bGradTMP1.Dump(stderr, "bGradTMP:");*/
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
/* XTensor bGradTMP21;
bGradTMP21 = ConvertDataType(*bGradTMP2, X_FLOAT);
bGradTMP21.Dump(stderr, "bGradTMP2:");*/
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
......@@ -832,21 +815,43 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
DelTensorBuf(interGrad);
}
//printf("\n");
//XTensor a2;
//a2 = ConvertDataType(*a, X_FLOAT);
//a2.Dump(stderr, "a2:");
//XTensor b2;
//b2 = ConvertDataType(*b, X_FLOAT);
//b2.Dump(stderr, "b2:");
DelTensorBuf(bGradTMP);
node->visitMark = NODE_FINISHED;
}
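/* editor's sketch (not part of this commit): the reshape-and-reduce step used
   in GradMultiplyDim above, shown on a concrete shape. For a 2 x 3 x 4 tensor
   holding dE/dc * a (element-wise) and a b of size 4 along the last dimension,
   folding the leading dimensions gives a 6 x 4 matrix whose column sum is
   dE/db. InitTensor1D is assumed to be the 1-D member of the InitTensor family
   used elsewhere in this file; shapes and the devID are arbitrary. */
XTensor g, dEdb;
InitTensor3D(&g, 2, 3, 4, X_FLOAT, 0);
InitTensor1D(&dEdb, 4, X_FLOAT, 0);
g.SetDataRand(-1.0F, 1.0F);
int reshapedSize[2] = {2 * 3, 4};
g.Reshape(2, reshapedSize);              /* fold every dimension except b's */
_ReduceSum(&g, &dEdb, 0);                /* column sum -> dE/db             */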
/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where a.reduce(0...n) is the reduction along the dimension
whose size is 1 in b. Note that there might be several reductions.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
}
}
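/* editor's sketch (not part of this commit): a minimal illustration of the
   dE/da rule implemented above, using only calls visible in this change;
   shapes and the devID are arbitrary, and alpha = 0.0F assumes the same
   accumulation semantics as _MultiplyDim. */
XTensor w, dEdc, dEda;
InitTensor2D(&w,    1, 8, X_FLOAT, 0);   /* b with a size-1 (broadcast) dim */
InitTensor2D(&dEdc, 4, 8, X_FLOAT, 0);   /* incoming gradient dE/dc         */
InitTensor2D(&dEda, 4, 8, X_FLOAT, 0);   /* result: dE/da                   */
w.SetDataRand(-1.0F, 1.0F);
dEdc.SetDataRand(-1.0F, 1.0F);
_MultiplyBroadcast(&dEdc, &w, &dEda, 0.0F);   /* dE/da = dE/dc * b          */
/* dE/db would additionally need a reduction over the broadcast dimension,
   which is the TODO branch above. */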
/*
gradient for negate
for
c = -a
......@@ -1020,7 +1025,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
}
......@@ -1284,6 +1288,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc * a.reduce(0..n) * \beta
where a.reduce(0..n) is the reduction along the dimension
whose size is 1 in b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
}
}
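/* editor's sketch (not part of this commit): the dE/da rule above is a plain
   accumulation, so the backward step for a reduces to a single _Sum call;
   shapes and the devID are arbitrary. */
XTensor dEdc, dEda;
InitTensor2D(&dEdc, 4, 8, X_FLOAT, 0);
InitTensor2D(&dEda, 4, 8, X_FLOAT, 0);
dEdc.SetDataRand(-1.0F, 1.0F);
dEda.SetZeroAll();
_Sum(&dEda, &dEdc, &dEda);               /* dE/da += dE/dc */
/* dE/db = beta * dE/dc reduced over every size-1 dimension of b; that
   reduction is the TODO branch above. */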
/*
gradient for reduceMean
for
......
......@@ -109,6 +109,11 @@ private:
static
void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for multiply one dimension: c = a * b
where some dimensions of b are of size 1 */
static
void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
/* gradient for negate */
static
void GradNegate(XTensor * node, bool isEfficient);
......@@ -143,6 +148,11 @@ private:
static
void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void GradSumBroadcast(XTensor * node, bool isEfficient);
/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node, bool isEfficient);
......
......@@ -27,7 +27,6 @@
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace nts{
......@@ -266,7 +265,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
XMathGrad::MakeGrad(node, isEfficent);
else if(XFuncGrad::IsFunc(node))
XFuncGrad::MakeGrad(node, isEfficent);
else if(XDataGrad::IsDataOP(node))
else if (XDataGrad::IsDataOP(node))
XDataGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node, isEfficent);
......
......@@ -839,6 +839,9 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
///* s = h_last * w */
//_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor h_last1;
h_last1 = ScaleAndShift(h_last, 100, 0);
......@@ -850,22 +853,27 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
int8H_last = ConvertDataType(h_last1, X_INT8);
int8W = ConvertDataType(w1, X_INT8);
XTensor s1;
InitTensor2D(&s1, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
s = ScaleAndShift(s1, 0.0001, 0);
XTensor s1;
InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);
_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
XTensor b2D;
InitTensor(&b2D, &s);
InitTensor2D(&b2D, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
b2D = ScaleAndShift(b2D, 10000, 0);
XTensor b2D1;
b2D1 = ConvertDataType(b2D, X_INT);
_Sum(&s1, &b2D1, &s1);
s = ConvertDataType(s1, X_FLOAT);
s = ScaleAndShift(s, 0.0001, 0);
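/* editor's note on the scale bookkeeping (inferred from the factors used
   here, not stated in the commit): h_last is scaled by 100 before the INT8
   conversion and the weight is presumably scaled by 100 as well in the
   elided lines above, so each integer product carries a factor of
   100 * 100 = 10000; the bias is therefore scaled by the same 10000 before
   the integer sum, and the final 0.0001 removes that common factor. */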
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
}
}
......@@ -1203,12 +1211,12 @@ void Test(const char * test, const char * result, FNNModel &model)
fprintf(ofile, "%d ", ngrams[0].words[i]);
for (int i = 0; i < ngramNum; i++)
fprintf(ofile, "%d ", ngrams[i].words[model.n - 1]);
fprintf(ofile, "||| ");
fprintf(ofile, "||| ");
for (int i = 0; i < model.n - 1; i++)
fprintf(ofile, "<s> ");
for (int i = 0; i < ngramNum; i++)
fprintf(ofile, "%f ", probs.Get1D(i));
fprintf(ofile, "||| %f\n", prob);
fprintf(ofile, "||| %f\n", prob);
loss += -prob;
wordCount += ngramNum;
......
......@@ -53,6 +53,42 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
......@@ -76,20 +112,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
InitTensor2D(&wq, d, dk, X_FLOAT16, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT16, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT16, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT16, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
//_SetDataRand(&wk, -finfoutk, finfoutk);
//_SetDataRand(&wq, -finfoutk, finfoutk);
//_SetDataRand(&wv, -finfoutv, finfoutv);
//_SetDataRand(&wa, -finfouta, finfouta);
wbig.SetDataRand(-finfoutbig, finfoutbig);
}
/*
......@@ -103,42 +138,138 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transofmration before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
XTensor halfK;
XTensor halfK2;
XTensor halfQ2;
XTensor halfV2;
XTensor halfK;
halfK = ConvertDataType(k, X_FLOAT16);
halfK2 = MMul(halfK, wk);
halfQ2 = MMul(halfK, wq);
halfV2 = MMul(halfK, wv);
if (selfatt) {
XTensor halfCon;
XList halfSplit;
halfCon = MMul(halfK, wbig);
int d1 = halfCon.GetDim(0);
int d2 = halfCon.GetDim(1);
int d3 = halfCon.GetDim(2) / 3;
InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);
halfSplit.Add(&halfQ2);
halfSplit.Add(&halfK2);
halfSplit.Add(&halfV2);
Split(halfCon, halfSplit, 2, 3);
}
else {
XTensor halfQ;
XTensor halfV;
halfQ = ConvertDataType(q, X_FLOAT16);
halfV = ConvertDataType(v, X_FLOAT16);
/* linear transformation before self-attention */
halfK2 = MMul(halfK, wk);
halfQ2 = MMul(halfQ, wq);
halfV2 = MMul(halfV, wv);
}
XTensor halfKheads;
XTensor halfQheads;
XTensor halfVheads;
/* multi head */
halfKheads = Split(halfK2, halfK2.order - 1, nhead);
halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
halfVheads = Split(halfV2, halfV2.order - 1, nhead);
XTensor halfMask;
XTensor halfAtt;
XTensor halfDot;
XTensor halfScalar;
XTensor halfAtt;
halfMask = ConvertDataType(mask, X_FLOAT16);
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);
//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
if (isMasked) {
XTensor halfMask;
halfMask = ConvertDataType(mask, X_FLOAT16);
halfDot = Sum(halfDot, halfMask);
}
}
halfDot = Linear(halfDot, 1.0F / (float)sqrt((float)dk / nhead));
......@@ -147,13 +278,10 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
if (isTraining && dropoutP > 0)
halfScalar = Dropout(halfScalar, dropoutP);
/*att = BMMul(scalar, vheads);*/
halfAtt = BMMul(halfScalar, halfVheads);
/* concatenate the heads */
return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
}
}
......@@ -59,7 +59,8 @@ public:
/* transformation after dot-product attention */
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
......@@ -95,7 +96,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
};
}
......
......@@ -21,6 +21,8 @@
#include <math.h>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
......@@ -93,7 +122,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
att = attentions[i].Make(x, x, x, mask, isTraining, true);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -107,7 +136,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
/* dropout */
if(isTraining && dropoutP > 0)
......
......@@ -27,9 +27,56 @@
namespace transformer
{
class AttDecoder : public AttEncoder
class AttDecoder
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. This is useful in LM where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* output tensor of the encoder */
XTensor * output;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
......@@ -53,4 +100,4 @@ public:
}
#endif
\ No newline at end of file
#endif
......@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
{
devID = myDevID;
mem = myMem;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
......@@ -110,7 +116,6 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
delete[] data;
}
/*
make the network
*/
......
......@@ -71,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -116,8 +116,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
att = attentions[i].Make(x, x, x, mask, isTraining, true);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
......@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
......@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
//_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
//_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
......@@ -274,7 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
//encoding.Dump(stderr, "encoding",10);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
//decoding.Dump(stderr, "decoding", 10);
outputLayer->Make(decoding, output);
delete[] dims;
......@@ -298,9 +301,10 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
list.Add(&encoder->fnns[i].b2);
list.Add(&encoder->attentions[i].wk);
list.Add(&encoder->attentions[i].wq);
list.Add(&encoder->attentions[i].wv);
//list.Add(&encoder->attentions[i].wk);
//list.Add(&encoder->attentions[i].wq);
//list.Add(&encoder->attentions[i].wv);
list.Add(&encoder->attentions[i].wbig);
list.Add(&encoder->attentions[i].wa);
list.Add(&encoder->fnnLayerNorms[i].w);
list.Add(&encoder->fnnLayerNorms[i].b);
......@@ -322,9 +326,10 @@ void T2TModel::GetParams(XList &list)
list.Add(&decoder->attentionsEnde[i].wa);
list.Add(&decoder->attEndeLayerNorms[i].w);
list.Add(&decoder->attEndeLayerNorms[i].b);
list.Add(&decoder->attentions[i].wk);
list.Add(&decoder->attentions[i].wq);
list.Add(&decoder->attentions[i].wv);
//list.Add(&decoder->attentions[i].wk);
//list.Add(&decoder->attentions[i].wq);
//list.Add(&decoder->attentions[i].wv);
list.Add(&decoder->attentions[i].wbig);
list.Add(&decoder->attentions[i].wa);
list.Add(&decoder->fnnLayerNorms[i].w);
list.Add(&decoder->fnnLayerNorms[i].b);
......
......@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
......
......@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
nextBatch = -1;
argNum = 0;
argArray = NULL;
buf = NULL;
buf2 = NULL;
bufBatch = NULL;
bufSize = 0;
bufBatchSize = 0;
seqOffset = NULL;
}
......@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
{
delete[] buf;
delete[] buf2;
delete[] bufBatch;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
......@@ -116,10 +120,12 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, argv, "smallfootprint", &isSmallFootprint, false);
LoadParamBool(argc, argv, "debug", &isDebugged, false);
LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
......@@ -165,12 +171,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XMem * mem = model->mem;
XNet net;
if(isSmallFootprint)
net.SetGradEfficientFlag();
if(isDebugged)
net.SetGradEfficientFlag(false);
PrepareModel(model);
double startT = GetClockSec();
FILE * fileen = fopen("enc.txt", "w");
FILE * filede = fopen("dec.txt", "w");
for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
......@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
//batchEnc.Dump(stderr, "enc",1);
//batchDec.Dump(stderr, "dec",1);
//paddingDec.Dump(stderr, "paddec");
/* output probabilities */
XTensor output;
......@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
if (output.GetDim(0) > 1)
if (output.GetDim(0) > 0)
PadOutput(&output, &gold, &paddingDec);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
//printf("%f\n", prob);
//float prob = 0;
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
//doUpdate = false;
if (doUpdate) {
/* rescale the output for normalized loss */
......@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
fclose(fileen);
fclose(filede);
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
......@@ -434,11 +451,13 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
model->Dump(fn);
if(validFN != NULL){
T2TTrainer trainer;
trainer.Init(argNum, argArray);
trainer.Test(validFN, fn2, model);
}
if (model->isLM) {
if (validFN != NULL) {
T2TTrainer trainer;
trainer.Init(argNum, argArray);
trainer.Test(validFN, fn2, model);
}
}
delete[] fn;
delete[] fn2;
......@@ -473,7 +492,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
int wordCount = 0;
while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
int len = (int)strlen(line);
if(line[0]=='b')
break;
while(line[len - 1] == '\r' || line[len - 1] == '\n'){
line[len - 1] = 0;
len--;
......@@ -544,9 +564,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
node.offset = i;
node.p = buf + offset;
node.size = 0;
for(int j = 0; j < step; j++)
int max = 0;
for(int j = 0; j < step; j++){
node.size += seqLen[i + j];
node.value = seqLen[i];
max = MAX(max, seqLen[i + j]);
}
//node.value = seqLen[i+1];
//node.value = MAX(seqLen[i+1],seqLen[i]);
node.value = max;
count++;
offset += node.size;
}
......@@ -768,6 +793,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
return sc;
}
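/* comparison function for qsort: orders batch nodes by key in descending
   order; since each key is drawn from rand() in LoadBatchMT, sorting by key
   shuffles the batches when the "randbatch" option is set */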
int CompareBatchNode(const void * a, const void * b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
......@@ -797,10 +828,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int devID, XMem * mem,
bool isTraining)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
//if (nextSeq < 0 || nextSeq >= nseqBuf)
// LoadBuf(file, isSorted, 2);
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int seq = 0;
bufBatchSize = 0;
nextBatch = 0;
/* we segment the buffer into batches */
while (seq < nseqBuf) {
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
while (seq + sc < nseqBuf) {
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec): MAX(maxDec, wnDec) * (sc + 2) / 2;
if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
}
BatchNode & batch = bufBatch[bufBatchSize];
batch.beg = seq;
batch.end = seq + sc;
batch.maxEnc = maxEnc;
batch.maxDec = maxDec;
batch.key = rand();
bufBatchSize++;
seq = seq + sc;
}
if(isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
}
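/* editor's note, with illustrative numbers only (not from real data): with
   sBatch = 2, wBatch = 64 and isBigBatch = false, once sc = 6 (three sentence
   pairs) and maxEnc = 20, and assuming the next source sentence has at most
   20 tokens, the projected cost is MAX(maxEnc, wnEnc) * (sc + 2) / 2
   = 20 * 8 / 2 = 80 > 64, so the batch is closed before adding that pair. */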
/*int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
......@@ -813,34 +904,44 @@ int T2TTrainer::LoadBatchMT(FILE * file,
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec): MAX(maxDec, wnDec) * (sc + 2) / 2;
if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
wnDec = len;
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
int tcEnc = isBigBatch ? wcEnc : maxEnc * sc / 2;
int tcDec = isBigBatch ? wcDec : maxDec * sc / 2;
if(sc >= sBatch * 2 && (tcEnc >= wBatch || tcDec >= wBatch))
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;*/
if(bufBatchSize <= 0)
return 0;
BatchNode & batch = bufBatch[nextBatch++];
int seq = batch.beg;
int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc;
int maxDec = batch.maxDec;
CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc/2;
int seqSize = 0;
int dimsDec[3] = {sCount, maxDec, vsDec};
......@@ -859,13 +960,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int wCountEnc = 0;
int wCountDec = 0;
int wCountPad = 0;
int wGold = 0;
wCount = 0;
int * batchEncValues = new int[batchEnc->unitNum];
int * batchDecValues = new int[batchDec->unitNum];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
//MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
......@@ -899,7 +1001,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len-1){
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++;
}
if (w > 0)
goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
......@@ -909,7 +1014,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
else
goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
//wCount++;
wCountDec++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
......@@ -922,19 +1027,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
}
batchDec->SetData(batchDecValues, batchDec->unitNum);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType(batchDec, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncValues;
delete[] batchDecValues;
//delete[] paddingEncOffsets;
//delete[] paddingDecOffsets;
delete[] paddingDecOffsets;
delete[] goldOffsets;
return sc;
......@@ -1015,20 +1120,20 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
model->GetParams(ws);
for(int i = 0; i < ws.count; i++){
for (int i = 0; i < ws.count; i++) {
XTensor * para = (XTensor*)ws.Get(i);
XTensor * paraGrad = para->grad;
XTensor * paraGrad = para->grad;
if (para->dataType == X_FLOAT)
{
if (paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if (paraGrad == NULL)
continue;
if (useAdam) {
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if (para->dataType == X_FLOAT)
{
if (useAdam)
{
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
......@@ -1070,18 +1175,12 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
/* clear gradient */
paraGrad->SetZeroAll();
}
else if (para->dataType == X_FLOAT16) {
XTensor paraGrad1(paraGrad->order, paraGrad->dimSize, X_FLOAT, paraGrad->denseRatio, paraGrad->devID, paraGrad->mem);
_ConvertDataType(paraGrad, &paraGrad1);
if (paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if (useAdam) {
else if (para->dataType == X_FLOAT16)
{
if (useAdam)
{
XTensor paraGrad1(paraGrad->order, paraGrad->dimSize, X_FLOAT, paraGrad->denseRatio, paraGrad->devID, paraGrad->mem);
_ConvertDataType(paraGrad, &paraGrad1);
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
......@@ -1129,16 +1228,74 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
}
/* clear gradient */
paraGrad->SetZeroAll();
}
paraGrad->SetZeroAll();
}
else {
ShowNTErrors("Unsupported data types for update!");
}
}
}
}
//void T2TTrainer::Update(T2TModel * model, const float lr)
//{
// XList ws(100);
//
// model->GetParams(ws);
//
// for(int i = 0; i < ws.count; i++){
// XTensor * para = (XTensor*)ws.Get(i);
// XTensor * paraGrad = para->grad;
//
// if (paraGrad == NULL)
// continue;
//
// CheckNTErrors(para != NULL, "NULL parameter tensor!");
// CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
//
// if(useAdam){
// adamBeta1T *= adamBeta1;
// adamBeta2T *= adamBeta2;
// DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
// DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
//
// if (para->dataType == X_FLOAT16) {
// d = d * 5e4;
// }
//
// /* m = beta_1 * m + (1-beta_1) * grad */
// XTensor * m = (XTensor*)moments.Get(i);
// _ScaleAndShiftMe(m, adamBeta1, 0);
// _Sum(m, paraGrad, m, (1.0F - adamBeta1));
//
// /* v = beta_2 * v + (1-beta_2) * grad * grad*/
// XTensor * v = (XTensor*)moments2nd.Get(i);
// _Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2));
// _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
//
// /* v2 = m / (sqrt(v) + delta) */
// XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
// _Power(v, v2, 0.5F);
// _ScaleAndShiftMe(v2, 1.0F, d);
// _Div(m, v2, v2);
//
// /* the delta rule */
//
// _Sum(para, v2, para, -e);
//
// DelTensorBuf(v2);
//
// }
// else{
// /* the delta rule */
// _Sum(para, paraGrad, para, -lr);
// }
//
// /* clear gradient */
// paraGrad->SetZeroAll();
// }
//}
/*
prepare model for training
>> model - the model for training
......
......@@ -33,6 +33,25 @@ using namespace nts;
namespace transformer
{
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
/* trainer of the T2T model */
class T2TTrainer
{
......@@ -49,9 +68,15 @@ public:
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
......@@ -66,6 +91,9 @@ public:
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
......@@ -142,8 +170,11 @@ public:
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* indicates whether we use small memory footprint for backward process */
bool isSmallFootprint;
/* randomize batches */
bool isRandomBatch;
/* indicates whether we intend to debug the net */
bool isDebugged;
public:
/* constructor */
......
......@@ -60,12 +60,13 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "output", outputFN, "");
srand((unsigned int)time(NULL));
T2TTrainer trainer;
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
......
......@@ -24,6 +24,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "XDevice.h"
#include "XGlobal.h"
#include "XThread.h"
......@@ -59,6 +60,7 @@ XDevice::~XDevice()
cublasDestroy(cublasHandle);
if(stream != NULL)
delete stream;
curandDestroyGenerator(gen);
#endif
}
......@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
Clear();
devID = myDevID;
seed = rand();
/* CPU information */
if(devID < 0){
......@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
cudaDeviceProp prop;
cudaSetDevice(myDevID);
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, seed);
if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
exit(1);
......@@ -270,6 +277,8 @@ XDevManager::~XDevManager()
/* initialize it and get the CPU and GPU information */
void XDevManager::Init()
{
srand((unsigned int)time(NULL));
Clear();
/* CPUs (we actually do not care about how many CPUs are using) */
......
......@@ -99,6 +99,9 @@ public:
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
#ifdef USE_CUDA
/* mutex for handle (GPU cublas) */
......@@ -109,6 +112,9 @@ public:
/* specify if the handle is initialized */
bool isHandleReady;
/* generator of random numbers */
curandGenerator_t gen;
#endif
......
......@@ -1461,6 +1461,23 @@ void XMem::CreateBLASHandle()
#endif
}
/* show profile of the memory pool */
void XMem::ShowMemUsage(FILE * file)
{
MTYPE used = 0;
MTYPE total = 0;
for(int i = 0; i < blockNum; i++){
if(blocks[i].mem != NULL){
used += blocks[i].used;
total += blocks[i].size;
}
}
fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)used/MILLION, (DTYPE)total/MILLION, (DTYPE)used/total);
}
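/* editor's sketch (not part of this commit): a possible way to call the new
   profiling helper, assuming "mem" points at an initialized memory pool such
   as the model->mem used elsewhere in this change */
if(mem != NULL)
    mem->ShowMemUsage(stderr);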
#ifdef USE_CUDA
/* get the handle of cublas */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include <stdio.h>
#include <stdlib.h>
#ifdef CUDA_BLAS
......@@ -402,6 +403,9 @@ public:
/* create a new cublas handle */
void CreateBLASHandle();
/* show profile of the memory pool */
void ShowMemUsage(FILE * file);
#ifdef USE_CUDA
/* get the handle of cublas */
cublasHandle_t * GetCublasHandle();
......
......@@ -67,6 +67,8 @@ const char * GetOPName(int type)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_MULTIPLYBROADCAST)
return "M_MULTIPLYBROADCAST";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_NORMALIZE)
......@@ -85,6 +87,8 @@ const char * GetOPName(int type)
return "M_SUM";
else if (type == MATH_SUMDIM)
return "M_SUMDIM";
else if (type == MATH_SUMBROADCAST)
return "M_SUMBROADCAST";
else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if (type == REDUCE_REDUCEMEAN)
......
......@@ -52,7 +52,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
......@@ -61,8 +62,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMBROADCAST MATH_SUMDIM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE MATH_SUMBROADCAST + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
......
......@@ -60,6 +60,7 @@
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif
/* the nts (NiuTrans.Tensor) namespace */
......@@ -690,9 +691,6 @@ set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
*/
void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{
// TODO: cuda code!!!!!!!
......@@ -703,7 +701,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
// srand((unsigned)time(0));
DTYPE variance = upper - lower;
void * d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
......@@ -715,7 +712,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
d = new double[unitNum];
for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
}
else if (dataType == X_FLOAT16) {
unsigned short random;
......@@ -1700,7 +1697,6 @@ void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, cons
{
ShowNTErrors("TO DO!");
}
}
/*
......
......@@ -46,8 +46,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......
......@@ -22,6 +22,8 @@
#include "DivDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -38,43 +40,89 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum,DTYPE alpha)
void KernelDivWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE alpha,bool alphaFired)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if (col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//int offset = colNum * row + col;
T alpha1;
if (sizeof(T) - sizeof(half) == 0) {
alpha1 = __float2half(alpha);
}
else {
alpha1 = (DTYPE)alpha;
}
int offset = colNum * row + col;
if (alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha1;
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
__global__
void KernelDivWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half alpha, bool alphaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
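/* one thread per column (threadIdx.y == 0) stages b[col] in shared memory;
   after the barrier, every row handled by the block reuses the cached value */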
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
//#endif
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum,DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// //int offset = colNum * row + col;
// T alpha1;
// if (sizeof(T) - sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = (DTYPE)alpha;
// }
//
// int offset = colNum * row + col;
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.x];
//
////#endif
//
//}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
where a is a tensor and b is a column vector
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha)
void KernelDivWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha,bool alphaFired)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if (row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
int offset = block * blockSize + row * colNum + col;
T alpha1;
if (sizeof(T) -sizeof(half) == 0) {
alpha1 = __float2half(alpha);
}
else {
alpha1 = alpha;
}
if (alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha1;
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
//#endif
//#endif
}
__global__
void KernelDivWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half alpha, bool alphaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
//
//
//template <class T, bool alphaFired>
//__global__
//void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE alpha)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
// __syncthreads();
//
////#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
//
// int offset = block * blockSize + row * colNum + col;
//
// T alpha1;
// if (sizeof(T) -sizeof(half) == 0) {
// alpha1 = __float2half(alpha);
// }
// else {
// alpha1 = alpha;
// }
//
// if (alphaFired)
// c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha1;
// else
// c[offset] = a[offset] / bv[threadIdx.y];
//
////#endif
//}
/*
tensor division
......@@ -171,24 +281,24 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelDivWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
blockSize, stride, blockSize * stride, blockNum, alpha,false);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelDivWithCol<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
blockSize, stride, blockSize * stride, blockNum, alpha,true);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelDivWithRow<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
blockNum, blockSize, alpha,false);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelDivWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
blockNum, blockSize, alpha,true);
}
else{
ShowNTErrors("Something is wrong!");
......@@ -197,25 +307,40 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithCol<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half *)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
if (alpha == (DTYPE)0.0F){
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
KernelDivWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half *)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1, false);
}
else{
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
KernelDivWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1, true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithRow<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, alpha);
if (alpha == (DTYPE)0.0F) {
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
KernelDivWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, alpha1, false);
}
else {
unsigned short temp = FloatToFloat16(alpha);
half alpha1 = *((half *)&temp);
KernelDivWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, alpha1, true);
}
}
else {
ShowNTErrors("Something is wrong!");
......
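The FP16 branches above all repeat the same two lines to turn the host-side DTYPE scalar into a half argument for the *Half kernels. A small helper capturing that pattern, shown only as a sketch (it assumes FloatToFloat16 returns the binary16 bit pattern of its argument as an unsigned short, and that cuda_fp16.h is in scope):
/* sketch: bit-cast the result of FloatToFloat16 into a half that can be
   passed directly to KernelDivWithColHalf and the other half kernels */
half HostFloatToHalf(DTYPE v)
{
    unsigned short bits = FloatToFloat16(v);   /* float -> binary16 bit pattern */
    return *((half *)&bits);                   /* reinterpret the bits as half */
}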
......@@ -22,12 +22,10 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"
#include "MatrixMulBatched.h"
#include "timer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -276,7 +274,8 @@ matrix multiplication with no transposition c = a * b * alpha
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * parallelRunner)
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -317,212 +316,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, DTYPE alpha, XPRunner * pa
return c;
}
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
//XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//_ConvertDataType(&a, halfA);
//_ConvertDataType(&b, halfB);
//_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
//_ConvertDataType(halfC, &c);
//DelTensorBuf(halfC);
//DelTensorBuf(halfB);
//DelTensorBuf(halfA);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
/*timer_c timerConvert1;
timerConvert1.m_start_timer();
*/
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
/*timerConvert1.m_end_timer();
printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
timer_c timerMatrixMul;
timerMatrixMul.m_start_timer();*/
_MatrixMul(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
/*timerMatrixMul.m_end_timer();
printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
timer_c timerConvert2;
timerConvert2.m_start_timer();
*/
_ConvertDataType(halfC, &c);
/*timerConvert2.m_end_timer();
printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());*/
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
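As a point of reference, a wrapper like this is exercised the same way as the FP16 test routines elsewhere in this commit; a minimal sketch (the 4 x 8 and 8 x 2 shapes and device id 0 are illustrative assumptions):
void MatrixMulFloat16Sketch()
{
    XTensor a;
    XTensor b;
    XTensor c;
    /* float32 inputs on GPU 0 */
    InitTensor2D(&a, 4, 8, X_FLOAT, 0);
    InitTensor2D(&b, 8, 2, X_FLOAT, 0);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);
    /* the conversion to FP16, the half-precision GEMM and the conversion
       back to float32 all happen inside the wrapper */
    c = MatrixMulFloat16(a, X_NOTRANS, b, X_NOTRANS);
    c.Dump(stderr, "c:");
}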
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
//XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
//timer_c timerConvert1;
//timerConvert1.m_start_timer();
//_ConvertDataType(&a, halfA);
//_ConvertDataType(&b, halfB);
//timerConvert1.m_end_timer();
//printf("time convert1 %f ms\n", timerConvert1.m_get_time_diff_msec());
//timer_c timerMatrixMul;
//timerMatrixMul.m_start_timer();
//_MatrixMul(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
//timerMatrixMul.m_end_timer();
//printf("time matrixmul %f ms\n", timerMatrixMul.m_get_time_diff_msec());
//timer_c timerConvert2;
//timerConvert2.m_start_timer();
//_ConvertDataType(halfC, &c);
//timerConvert2.m_end_timer();
//printf("time convert2 %f ms\n\n", timerConvert2.m_get_time_diff_msec());
//DelTensorBuf(halfC);
//DelTensorBuf(halfB);
//DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
}
\ No newline at end of file
}// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
#define MMul16 MatrixMulFloat16
/*
matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
......@@ -64,13 +63,6 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMulFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -82,19 +82,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
/*if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}*/
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
}
/* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) {
......
......@@ -156,18 +156,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
//if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
// _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
// b->data, transposedB, a->dataType, c->data, c->dataType,
// a->dimSize[0], a->dimSize[1],
// b->dimSize[0], b->dimSize[1],
// c->dimSize[0], c->dimSize[1],
// alpha, beta);
//}
//else {
// // TODO!!
// ShowNTErrors("TODO!");
//}
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
......
......@@ -63,44 +63,6 @@ void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* trans(a) * b */
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) {
int num = an;
/*if (a->dataType == X_FLOAT16) {
for (int i = 0; i < cn; i++) {
X_FLOAT16 *p3 = (X_FLOAT16*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
X_FLOAT16 r = 0;
X_FLOAT16 * p1 = (X_FLOAT16*)a->data + 0 * am + i;
X_FLOAT16 * p2 = (X_FLOAT16*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}
else {
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
DTYPE r = 0;
DTYPE * p1 = (DTYPE*)a->data + 0 * am + i;
DTYPE * p2 = (DTYPE*)b->data + 0 * bm + j;
for (int k = 0; k < num; k++) {
r += (*p1) * (*p2) * alpha;
p1 += aColNum;
p2 += bColNum;
}
*p3 = *p3 * beta + r;
p3 += 1;
}
}
}*/
for (int i = 0; i < cn; i++) {
DTYPE * p3 = (DTYPE*)c->data + i * cm;
for (int j = 0; j < cm; j++) {
......
......@@ -22,7 +22,6 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../CHeader.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
......@@ -388,142 +387,4 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
return c;
}
/*
matrix multiplication of the two tensors (return an XTensor structure)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
///*call _MatrixMulBatched function */
//_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMulBatched(halfA, transposedA, halfB, transposedB, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
/*
matrix multiplication of the two tensors (return an XTensor structure)
c = a * b * alpha
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = ai * bi * alpha + cm * beta
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
///*call _MatrixMulBatched function */
//_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
XTensor * halfA = NewTensorBuf(a.order, a.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfB = NewTensorBuf(b.order, b.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
XTensor * halfC = NewTensorBuf(c.order, c.dimSize, X_FLOAT16, 1.0F, a.devID, a.mem);
_ConvertDataType(&a, halfA);
_ConvertDataType(&b, halfB);
_MatrixMulBatched(halfA, X_NOTRANS, halfB, X_NOTRANS, halfC, alpha, 0, parallelRunner);
_ConvertDataType(halfC, &c);
DelTensorBuf(halfC);
DelTensorBuf(halfB);
DelTensorBuf(halfA);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define BMMul MatrixMulBatched
#define BMMul16 MatrixMulBatchedFloat16
/*
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
......@@ -85,12 +84,6 @@ ci = ai * bi * alpha + cm * beta
XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulBatchedFloat16(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMulBatchedFloat16(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
......@@ -46,8 +46,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
......
......@@ -22,9 +22,10 @@
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -136,29 +137,168 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, alpha);
_MultiplyDim(&a, &b, &c, n, 0);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, 0);
return c;
}
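With the alpha parameter gone from this interface, callers pass only the tensor pair and the dimension index. A minimal sketch, assuming a 1-D initializer InitTensor1D analogous to the InitTensor2D used in the tests (shapes and device id 0 are illustrative):
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 4, X_FLOAT, 0);    /* a is 2 x 4 */
InitTensor1D(&b, 4, X_FLOAT, 0);       /* b matches dimension 1 of a */
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
/* each row of a is multiplied element-wise with b */
c = MultiplyDim(a, b, 1);
c.Dump(stderr, "c:");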
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of the dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
int order = a->order;
int count = 0;
void * source = 0;
void * target = 0;
for(int i = 0; i < order; i++){
if(a->GetDim(i) == b->GetDim(i))
continue;
if(b->GetDim(i) == 1){
int fitSize = a->GetDim(i);
int j = i + 1;
/* we define a range over dimensions. It is to be unsqueezed */
for(; j < order; j++){
if(a->GetDim(j) == b->GetDim(j))
break;
fitSize *= a->GetDim(j);
}
int dimsS[MAX_TENSOR_DIM_NUM];
int dimsT[MAX_TENSOR_DIM_NUM];
for(int k = 0; k < i; k++){
dimsS[k] = a->GetDim(k);
dimsT[k] = a->GetDim(k);
}
dimsT[i] = fitSize;
bool isLast = true;
for(int k = j; k < order; k++){
dimsS[i + k - j + 0] = b->GetDim(k);
dimsT[i + k - j + 1] = b->GetDim(k);
if(a->GetDim(k) != b->GetDim(k)){
if(b->GetDim(k) == 1)
isLast = false;
else{
ShowNTErrors("Wrong dimension size!")
}
}
}
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
else{
source = target;
}
target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
s->data = source;
t->data = target;
_Unsqueeze(s, t, i, fitSize);
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, size);
else
XMemFree(t->devID, source);
}
/* we do multiplication here */
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
else
XMemFree(t->devID, target);
target = NULL;
}
s->data = NULL;
t->data = NULL;
DelTensor(s);
DelTensor(t);
i = j;
count++;
}
}
if(count == 0)
_Multiply(a, b, c, beta);
CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast multiplication
c = a * b
where some of the dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
<< return - the resulting tensor c
*/
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
return c;
}
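A sketch of how the broadcast version might be called, with the size-1 dimension of b expanded internally to match a (shapes and device id 0 are illustrative):
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 4, X_FLOAT, 0);    /* a is 2 x 4 */
InitTensor2D(&b, 2, 1, X_FLOAT, 0);    /* dimension 1 of b has size 1 */
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
/* b is unsqueezed to 2 x 4 internally, then multiplied with a element-wise */
c = MultiplyBroadcast(a, b);
c.Dump(stderr, "c:");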
......
......@@ -217,8 +217,6 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
......@@ -243,7 +241,9 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
}
#endif
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
......
......@@ -34,9 +34,16 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor broadcast multiplication c = a * b + c * \beta where some of the dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast multiplication c = a * b where some of the dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -22,6 +22,8 @@
#include "SubDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -38,11 +40,10 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta,bool betaFired)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -54,21 +55,66 @@ __global__
__syncthreads();
T beta1;
if (sizeof(T) - sizeof(half) == 0) {
beta1 =__float2half(beta);
}
else {
beta1 = beta;
}
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta1;
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
......@@ -82,11 +128,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta,bool betaFired)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -102,23 +148,79 @@ __global__
__syncthreads();
T beta1;
int offset = block * blockSize + row * colNum + col;
if (sizeof(T) - sizeof(half) == 0) {
beta1 = __float2half(beta);
}
else {
beta1 = beta;
}
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta1;
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
/*
tensor subtraction (cuda version)
......@@ -163,24 +265,24 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta,false);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta,true);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta,false);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
KernelSubWithRow<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta,true);
}
else {
ShowNTErrors("Something is wrong!");
......@@ -190,25 +292,40 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
if (beta == (DTYPE)1.0F){
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, true);
}
}
else {
ShowNTErrors("Something is wrong!");
......
......@@ -60,6 +60,16 @@ void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
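Note that beta still arrives as DTYPE and is truncated by the (int) cast, so a fractional beta silently drops the b term for integer tensors. A host-side illustration of the same arithmetic (values are illustrative):
/* mirrors the per-element computation in KernelADDInt */
int AddWithIntBeta(int a, int b, DTYPE beta)
{
    return a + b * (int)beta;    /* beta = 0.5F -> (int)beta = 0, so the result is just a */
}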
/*
tensor summation c = a + b * \beta (cuda version)
......@@ -101,7 +111,7 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
if ((c == a && handle != NULL) && *handle != 0) {
#ifdef DOUBELPRICSION
cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)->data, 1, (DTYPE*)a->data, 1);
cublasDaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#else
cublasSaxpy(*handle, a->unitNum, &beta, (DTYPE*)b->data, 1, (DTYPE*)a->data, 1);
#endif
......@@ -121,16 +131,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
cublasHandle_t * handle = NULL;
if ((a->mem != NULL) && (b->mem != NULL)) {
cublasHandle_t * handleA = a->mem->GetCublasHandle();
cublasHandle_t * handleB = b->mem->GetCublasHandle();
handle = *handleA != 0 ? handleA : handleB;
}
else {
handle = GDevs.GetCudaHandle(a->devID);
}
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
......@@ -141,6 +141,20 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else {
// TODO!!
......
......@@ -17,12 +17,16 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -152,7 +156,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
......@@ -162,5 +166,146 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c;
}
/*
tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
*/
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
int order = a->order;
int count = 0;
void * source = 0;
void * target = 0;
for(int i = 0; i < order; i++){
if(a->GetDim(i) == b->GetDim(i))
continue;
if(b->GetDim(i) == 1){
int fitSize = a->GetDim(i);
int j = i + 1;
/* we define a range over dimensions. It is to be unsqueezed */
for(; j < order; j++){
if(a->GetDim(j) == b->GetDim(j))
break;
fitSize *= a->GetDim(j);
}
int dimsS[MAX_TENSOR_DIM_NUM];
int dimsT[MAX_TENSOR_DIM_NUM];
for(int k = 0; k < i; k++){
dimsS[k] = a->GetDim(k);
dimsT[k] = a->GetDim(k);
}
dimsT[i] = fitSize;
bool isLast = true;
for(int k = j; k < order; k++){
dimsS[i + k - j + 0] = b->GetDim(k);
dimsT[i + k - j + 1] = b->GetDim(k);
if(a->GetDim(k) != b->GetDim(k)){
if(b->GetDim(k) == 1)
isLast = false;
else{
ShowNTErrors("Wrong dimension size!")
}
}
}
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
else{
source = target;
}
target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
s->data = source;
t->data = target;
_Unsqueeze(s, t, i, fitSize);
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, size);
else
XMemFree(t->devID, source);
}
/* we do summation here */
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
else
XMemFree(t->devID, target);
target = NULL;
}
s->data = NULL;
t->data = NULL;
DelTensor(s);
DelTensor(t);
i = j;
count++;
}
}
if(count == 0)
_Sum(a, b, c, beta);
CheckNTErrors(target == NULL, "Something is wrong!");
}
/*
tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1
c = a + b * \beta
we return c here
>> a - a tensor
>> b - another tensor that would be broadcasted
>> beta - the scaling factor
<< return - the resulting tensor c
*/
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
return c;
}
}
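Like MultiplyBroadcast, the high-level interface hides the unsqueeze and buffer bookkeeping; a minimal usage sketch (shapes, the beta value and device id 0 are illustrative):
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 4, X_FLOAT, 0);    /* a is 2 x 4 */
InitTensor2D(&b, 1, 4, X_FLOAT, 0);    /* dimension 0 of b has size 1 */
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
/* b is broadcast along dimension 0, then c = a + b * 2.0 */
c = SumBroadcast(a, b, 2.0F);
c.Dump(stderr, "c:");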
......@@ -22,6 +22,8 @@
#include "SumDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -38,38 +40,82 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
void KernelAddWithRow(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, DTYPE beta,bool betaFired) {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if (col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
__syncthreads();
T beta1;
if (sizeof(T) - sizeof(half) == 0) {
beta1 = __float2half(beta);
}
else {
beta1 = beta;
}
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] + bv[threadIdx.x] * beta;
else
c[offset] = a[offset] + bv[threadIdx.x];
}
__global__
void KernelAddWithRowHalf(half *a, half *b, half *c, int rowNum, int colNum, half beta, bool betaFired) {
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
int offset = colNum * row + col;
if(betaFired)
c[offset] = a[offset] + bv[threadIdx.x] * beta1;
else
c[offset] = a[offset] + bv[threadIdx.x];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] + bv[threadIdx.x] * beta;
else
c[offset] = a[offset] + bv[threadIdx.x];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if(col >= colNum || row >= rowNum)
// return;
//
// if(threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.x];
//}
/*
tensor summation of a tensor and a column vector
c = a + b * \beta
......@@ -83,42 +129,95 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
void KernelAddWithCol(DTYPE *a, DTYPE *b, DTYPE *c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta, bool betaFired) {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if (row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
__syncthreads();
T beta1;
if (sizeof(T) - sizeof(half) == 0) {
beta1 = __float2half(beta);
}
else {
beta1 = beta;
}
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta;
else
c[offset] = a[offset] + bv[threadIdx.y];
}
__global__
void KernelAddWithColHalf(half *a, half *b, half *c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired) {
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
int offset = block * blockSize + row * colNum + col;
if(betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta1;
else
c[offset] = a[offset] + bv[threadIdx.y];
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta;
else
c[offset] = a[offset] + bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
//void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if(row >= rowNum || block >= blockNum)
// return;
//
// if(threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if(betaFired)
// c[offset] = a[offset] + bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] + bv[threadIdx.y];
//}
/*
tensor summation (cuda version)
......@@ -162,25 +261,25 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(beta == (DTYPE)1.0F)
KernelAddWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
if (beta == (DTYPE)1.0F)
KernelAddWithCol << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta, false);
else
KernelAddWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelAddWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta, true);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(beta == (DTYPE)1.0F)
KernelAddWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta,false);
else
KernelAddWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelAddWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
blockNum, blockSize, beta,true);
}
else{
ShowNTErrors("Something is wrong!");
......@@ -190,25 +289,37 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else if (a->dataType==X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithCol<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelAddWithCol<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
blockSize, stride, blockSize * stride, blockNum, beta1,true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelAddWithRow<half, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
else
KernelAddWithRow<half, true> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta);
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelAddWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, true);
}
}
else {
ShowNTErrors("Something is wrong!");
......
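With the half kernels above in place, SumDim can be driven end-to-end from FP16 tensors obtained via ConvertDataType; a condensed sketch, again assuming a 1-D initializer InitTensor1D (shapes, the beta value and device id 0 are illustrative):
XTensor a;
XTensor b;
XTensor c;
XTensor halfA;
XTensor halfB;
XTensor halfC;
InitTensor2D(&a, 4, 3, X_FLOAT, 0);
InitTensor1D(&b, 4, X_FLOAT, 0);       /* matches dimension 0 of a */
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);
halfA = ConvertDataType(a, X_FLOAT16);
halfB = ConvertDataType(b, X_FLOAT16);
halfC = SumDim(halfA, halfB, 0, 2.0F); /* c = a + b * 2.0 broadcast along dimension 0 */
c = ConvertDataType(halfC, X_FLOAT);
c.Dump(stderr, "c:");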
......@@ -17,6 +17,8 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
*/
#ifndef __SUMDIM_CUH__
......
......@@ -18,6 +18,9 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
* It reached 39 degrees centigrade around 3:00 pm in Shenyang
* &Updated by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-12-26
* Add summation by broadcasting.
* Four of my master students graduated. Good luck to them for their future work!
*/
#ifndef __SUMDIM_H__
......@@ -38,6 +41,13 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of the dimensions of b can be of size 1.
we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -23,89 +23,12 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
#include <stdint.h>
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
//typedef char __int8;
half uint16_as_fp16(uint16_t a)
{
half res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
half f;
uint16_t i;
} cvt;
cvt.i = a;
res = cvt.f;
#endif /* __cplusplus */
return res;
}
uint32_t fp32_as_uint32(float a)
{
uint32_t res;
#if defined (__cplusplus)
memcpy(&res, &a, sizeof(res));
#else /* __cplusplus */
volatile union {
float f;
uint32_t i;
} cvt;
cvt.f = a;
res = cvt.i;
#endif /* __cplusplus */
return res;
}
/* host version of device function __float2half_rn() */
half float2half_rn(float a)
{
uint32_t ia = fp32_as_uint32(a);
uint16_t ir;
ir = (ia >> 16) & 0x8000;
if ((ia & 0x7f800000) == 0x7f800000) {
if ((ia & 0x7fffffff) == 0x7f800000) {
ir |= 0x7c00; /* infinity */
}
else {
ir = 0x7fff; /* canonical NaN */
}
}
else if ((ia & 0x7f800000) >= 0x33000000) {
int shift = (int)((ia >> 23) & 0xff) - 127;
if (shift > 15) {
ir |= 0x7c00; /* infinity */
}
else {
ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
if (shift < -14) { /* denormal */
ir |= ia >> (-1 - shift);
ia = ia << (32 - (-1 - shift));
}
else { /* normal */
ir |= ia >> (24 - 11);
ia = ia << (32 - (24 - 11));
ir = ir + ((14 + shift) << 10);
}
/* IEEE-754 round to nearest of even */
if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
ir++;
}
}
}
return uint16_as_fp16(ir);
}
/*
matrix multiplication via cuda version BLAS
*/
......@@ -171,11 +94,24 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
//ShowNTErrors("TO DO!");
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the leading dimensions of the two tensors (lda, ldb) be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
else {
ShowNTErrors("Unsupported data type!");
......
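Because of this constraint, anything feeding the INT8 path needs its leading dimensions checked (or padded) up front; a tiny host-side guard as a sketch (the function name is illustrative):
/* sketch: cublasGemmEx with CUDA_R_8I inputs requires lda/ldb (here ma and mb)
   to be multiples of 4, so reject other shapes before launching */
bool Int8GemmDimsOk(int ma, int mb)
{
    return (ma % 4 == 0) && (mb % 4 == 0);
}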
......@@ -26,8 +26,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
half float2half_rn(float a);
/* matrix multiplication (BLAS) */
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
......
......@@ -430,6 +430,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
//delete t2;
}
}
/*
generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if the item >= p, and to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
//CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if (tensor->devID < 0) {
_SetDataRand(tensor, lower, upper);
DTYPE * data = (DTYPE*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) {
if (data[i] >= p)
data[i] = value;
else
data[i] = 0;
}
}
else {
#ifdef USE_CUDA
_CudaSetDataRandP(tensor, lower, upper, p, value);
#else
ShowNTErrors("Please recompile the code by specifying USE_CUDA");
#endif // USE_CUDA
}
}
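A sketch of how _SetDataRandP might be used to build a dropout-style mask: draw from U[0, 1), set items at or above the threshold to a rescaled value and zero the rest (the 0.3 threshold and the 10 x 10 shape are illustrative):
XTensor mask;
InitTensor2D(&mask, 10, 10, X_FLOAT, 0);
/* items >= 0.3 become 1 / (1 - 0.3), the rest become 0 */
_SetDataRandP(&mask, 0.0F, 1.0F, 0.3F, 1.0F / (1.0F - 0.3F));
mask.Dump(stderr, "mask:");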
/*
generate data items with a normal distribution with specified mean and standard deviation
......
......@@ -28,6 +28,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -197,6 +198,7 @@ set data array with a uniform distribution in [low, high]
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
......@@ -204,6 +206,47 @@ void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
}
#endif
}
/*
set each data item to a pre-defined value if it is >= p, and to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value we intend to assign to the item
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= p)
d[i] = value;
else
d[i] = 0;
}
}
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
/*
......@@ -473,34 +516,81 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
DTYPE variance = upper - lower;
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
if (tensor->dataType == X_FLOAT) {
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_DOUBLE) {
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
curandDestroyGenerator(gen);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
else{
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
else if (tensor->dataType == X_FLOAT16) {
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID,tensor->mem);
curandGenerateUniform(gen, (float *)tensor1.data, tensor1.unitNum);
curandDestroyGenerator(gen);
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
DTYPE variance = upper - lower;
if (variance != 1.0F || lower != 0) {
if (tensor->dataType == X_FLOAT) {
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_DOUBLE) {
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
_CudaSetDataRand(tensor, lower, upper);
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) {
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set the data with an array of offsets (kernel version)
>> data - pointer to the data array
......
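A hedged usage sketch of the new GPU path (the shape and probability are illustrative, and the calls assume the usual NiuTrans.Tensor headers): _SetDataRandP dispatches to _CudaSetDataRandP on a GPU tensor, which draws uniform values with the cached curand generator and then binarizes them with KernelSetDataPCut / KernelSetDataPCutHalf, which is exactly the mask Dropout needs with keep probability 1 - dropProb.

/* illustrative only: build a dropout-style mask directly on GPU 0 */
XTensor mask;
InitTensor2D(&mask, 2, 2, X_FLOAT, 0);

DTYPE dropProb = 0.3F;
DTYPE scale = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);

/* each element becomes scale with probability 1 - dropProb, and 0 otherwise */
_SetDataRandP(&mask, 0.0F, 1.0F, dropProb, scale);
mask.Dump(stderr, "mask:");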
......@@ -49,6 +49,10 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
......
......@@ -57,6 +57,10 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper], then set
each item to a pre-defined value if it is >= p, and to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
......
......@@ -35,8 +35,6 @@ get the power(a, p)
*/
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors(a->dataType == b->dataType, "Unmatched tensors in addition!");
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
......
......@@ -138,4 +138,4 @@ XTensor Gather(XTensor &s, XTensor &index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -270,4 +270,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -416,4 +416,4 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -23,7 +23,6 @@
#include "../../XName.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -24,7 +24,6 @@
#include "../../XUtility.h"
#include "ReduceMax.h"
#include "ReduceMax.cuh"
#include "../getandset/ConvertDataType.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -544,22 +543,22 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp << <grids, blocks >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
do {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
......@@ -581,7 +580,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -590,7 +589,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -599,7 +598,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -608,7 +607,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -617,75 +616,69 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
iter++;
} while (strideNum > 1);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
......@@ -693,9 +686,6 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(input->devID, devIDBackup);
......
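The FP32 and FP16 branches above share the same iterative structure: each pass launches a kernel over strideNum elements, swaps the roles of buf1 and buf2, and repeats until one value per block remains. The standalone CPU sketch below (my own illustration, not repository code) mirrors that ping-pong pattern for a max reduction.

/* CPU analogue of the buf1/buf2 ping-pong in _CudaReduceMax: each pass takes
   the max of fixed-size chunks and writes into the other buffer until a
   single value is left */
#include <algorithm>
#include <vector>

static float PingPongMax(std::vector<float> data, size_t chunk = 4)
{
    if (data.empty())
        return 0.0f;                 /* nothing to reduce */
    std::vector<float> buf1 = data, buf2;
    std::vector<float> * in = &buf1, * out = &buf2;
    while (in->size() > 1) {
        out->clear();
        for (size_t i = 0; i < in->size(); i += chunk) {
            size_t end = std::min(in->size(), i + chunk);
            out->push_back(*std::max_element(in->begin() + i, in->begin() + end));
        }
        std::swap(in, out);          /* the output of this pass feeds the next one */
    }
    return in->front();
}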
......@@ -735,34 +735,34 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == DEFAULT_DTYPE) {
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0)
blockNum = (int)(blockNum / 4) + 1;
else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp);
}
if (stride == 1 && blockNum >= 10 && input->dataType == DEFAULT_DTYPE) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
do {
if (blockNum % 4 != 0)
blockNum = (int)(blockNum / 4) + 1;
else
blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks <<<blockNum, 128>>> ((DTYPE *)input->data, (DTYPE*)output->data,
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096 && input->dataType == DEFAULT_DTYPE){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(&grid, &block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp);
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
......@@ -783,8 +783,8 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -793,8 +793,8 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -803,8 +803,8 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -813,8 +813,8 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
......@@ -823,87 +823,76 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(&blocks, &threads);
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.x,
blockSize, blockNum, sp, power, isExp);
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
iter++;
} while (strideNum > 1);
}
}
else if (input->dataType == X_FLOAT16) {
do {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
}
strideNum = cudaGridSize[0];
......@@ -915,13 +904,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
}
else {
ShowNTErrors("TODO!");
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
......
......@@ -27,6 +27,7 @@
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/CHeader.h"
#include "../core/getandset/SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor
......@@ -40,7 +41,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
the same inference procedure as that on the test data with no use of dropout.
>> x - input tensor
>> y - output tensor
......@@ -123,8 +124,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
else
ShowNTErrors("TODO!");
}
/*
/*
dropout function (we make tensor connections here)
It randomly zeroes some of the elements of the input tensor
with probability p via a Bernoulli distribution.
......@@ -135,89 +136,108 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension along which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
{
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
int n = leadingDim < 0 ? x.order - 1 : leadingDim;
XTensor mask;
DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
if(leadingDim < 0 && leadingDim2 < 0){
XTensor mask;
InitTensor(&mask, &x);
_SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
return Multiply(x, mask);
}
else if(leadingDim2 < 0){
int n = leadingDim;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
maskArray = new DTYPE[unitNum];
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n];
DTYPE * maskArray = new DTYPE[unitNum];
XTensor mask;
InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++) {
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
}
delete[] maskArray;
XTensor mask;
InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
if (x.dataType == X_FLOAT)
{
return MultiplyDim(x, mask, n);
}
else if (x.dataType == X_FLOAT16)
{
XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType(&mask, &mask1);
return MultiplyDim(x, mask1, n);
}
else {
ShowNTErrors("TODO!");
}
}
else{
int n = leadingDim;
int m = leadingDim2;
delete[] maskArray;
CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
/* generate a mask tensor with probability p */
int unitNum = x.dimSize[n] * x.dimSize[m];
maskArray = new DTYPE[unitNum];
if (x.dataType == X_FLOAT)
{
return MultiplyDim(x, mask, n, 0);
}
else
{
XTensor mask1;
mask1 = ConvertDataType(mask, X_FLOAT16);
//srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
return MultiplyDim(x, mask1, n, 0);
}
}
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < x.order; i++)
dims[i] = 1;
dims[n] = x.GetDim(n);
dims[m] = x.GetDim(m);
InitTensor(&mask, x.order, dims, X_FLOAT, x.denseRatio,x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
if (x.dataType == X_FLOAT)
{
return MultiplyBroadcast(x, mask);
}
else if (x.dataType == X_FLOAT16)
{
XTensor mask1(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
//mask1 = ConvertDataType(mask, X_FLOAT16);
_ConvertDataType(&mask, &mask1);
return MultiplyBroadcast(x, mask1);
}
else {
ShowNTErrors("TODO!");
}
}
//XTensor DropoutFloat16(const XTensor &x, DTYPE dropProb, int leadingDim)
//{
// CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
//
// int n = leadingDim < 0 ? x.order - 1 : leadingDim;
//
// CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
//
// DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
//
// /* generate a mask tensor with probability p */
// int unitNum = x.dimSize[n];
// DTYPE * maskArray = new DTYPE[unitNum];
//
// //srand((unsigned int)time(NULL));
// for (int i = 0; i < unitNum; i++) {
// maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
// }
//
// XTensor mask;
// InitTensor1D(&mask, unitNum, X_FLOAT, x.devID, x.mem);
// mask.SetData(maskArray, unitNum);
//
// delete[] maskArray;
//
// XTensor halfMask;
// halfMask = ConvertDataType(mask, X_FLOAT16);
// XTensor halfX;
// halfX = ConvertDataType(x, X_FLOAT16);
// XTensor result;
// XTensor halfResult;
//
// halfResult = MultiplyDim(halfX, halfMask, n, 0);
//
// result = ConvertDataType(halfResult, X_FLOAT);
// return result;
// /*return MultiplyDim(x, mask1, n, 0);*/
//}
}
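A hedged usage sketch of the reworked interface (shapes and probabilities are illustrative, wrapped in a hypothetical test in the style of the ones above): with both leading dimensions left at -1 the mask covers every element via _SetDataRandP; with one leading dimension it is broadcast along that axis via MultiplyDim; with two it is broadcast over both via MultiplyBroadcast. In every case the kept elements are scaled by 1 / (1 - dropProb), so with dropProb = 0.2 they are multiplied by 1.25 and no extra scaling is needed at inference time.

void DropoutUsageSketch()
{
    XTensor x;
    InitTensor2D(&x, 8, 16, X_FLOAT, 0);
    x.SetDataRand(-1.0F, 1.0F);

    XTensor y1;
    XTensor y2;
    XTensor y3;

    y1 = Dropout(x, 0.2F);          /* element-wise mask (both leading dims = -1) */
    y2 = Dropout(x, 0.2F, 1);       /* mask broadcast along dimension 1 */
    y3 = Dropout(x, 0.2F, 0, 1);    /* mask broadcast along dimensions 0 and 1 */

    y1.Dump(stderr, "y1:");
}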
/*
dropout function without broadcast
......@@ -235,7 +255,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
srand((unsigned int)time(NULL));
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
......
......@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
{
return (DTYPE)rand()/(DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
}
/* dropout function */
......@@ -40,11 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE dropProb, int leadingDim = -1);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
/* dropout function without broadcast */
XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);
......
......@@ -27,9 +27,6 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../../tensor/core/getandset/ConvertDataType.h"
using namespace nts;
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -182,125 +179,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
delete[] dimSize;
// if (!x->isSparse && !y->isSparse &&
// x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
// {
// int * dimSize = new int[x->order - 1];
// for (int i = 0; i < x->order; i++) {
// if (i < leadDim)
// dimSize[i] = -x->dimSize[i];
// else if (i > leadDim)
// dimSize[i - 1] = -x->dimSize[i];
// }
//
// XMem * mem = x->mem;
// XTensor * max = NULL;
// XTensor * sum = NULL;
// XTensor * blockx = NULL;
// XTensor * blocky = NULL;
// XTensor * blockMax = NULL;
// XTensor * blockSum = NULL;
//
// int dimensionSize = y->dimSizeRDI[leadDimRDI];
// int stride = 1;
// int blockSize = 1;
// int blockNum = 1;
//
// for (int i = 0; i < leadDimRDI; i++)
// stride *= y->dimSizeRDI[i];
// blockSize = stride * dimensionSize;
// blockNum = y->unitNum / blockSize;
//
// max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
// sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
//
// _ReduceMax(x, max, leadDim);
// _ReduceSum(x, sum, leadDim, max, 1.0F, true);
//
// if (x->devID >= 0) {
// if (leadDimRDI == 0) {
// blockSize = y->unitNum;
// blockNum = 1;
// blockx = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(blockSize / dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(blockSize / dimensionSize, -1, x->dataType, x->devID, mem);
// }
// else {
// blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
// blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
// }
// }
//
// for (int k = 0; k < blockNum; k++) {
// int m = stride;
// int n = dimensionSize;
//
// DTYPE * ip = (DTYPE*)x->data + k * blockSize;
// DTYPE * op = (DTYPE*)y->data + k * blockSize;
// DTYPE * mp = (DTYPE*)max->data + k * blockSize / dimensionSize;
// DTYPE * sp = (DTYPE*)sum->data + k * blockSize / dimensionSize;
//
// if (x->devID < 0) {
// for (int j = 0; j < m; j++) {
// DTYPE sumValue = sp[j];
// if (sumValue == 0) {
// for (int i = 0; i < n; i++)
// op[i * m + j] = 0;
// }
// else {
// for (int i = 0; i < n; i++) {
// DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
// if (IsNAN(r))
// r = LOGPROB_MIN;
// if (IsINF(r))
// r = LOGPROB_MIN;
//
// op[i * m + j] = MAX(r, LOGPROB_MIN);
// }
// }
// }
// }
// else {
// blockx->data = ip;
// blocky->data = op;
// blockMax->data = mp;
// blockSum->data = sp;
//#ifdef USE_CUDA
// if (leadDimRDI == 0)
// _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
// else
// _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
//#else
// ShowNTErrors("Please specify USE_CUDA and recompile the code!");
//#endif
// blockx->data = NULL;
// blocky->data = NULL;
// blockMax->data = NULL;
// blockSum->data = NULL;
// }
// }
//
// DelTensorBuf(max);
// DelTensorBuf(sum);
//
// if (x->devID >= 0) {
// delete blockx;
// delete blocky;
// delete blockMax;
// delete blockSum;
// }
//
// delete[] dimSize;
// }
// else
// ShowNTErrors("TODO!");
}
/*
......
......@@ -26,7 +26,6 @@
#include "../core/reduce/ReduceSum.cuh"
#include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -26,7 +26,6 @@
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -29,7 +29,6 @@
#include "../core/arithmetic/Sum.h"
#include "../XDevice.h"
#include "../XUtility.h"
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -86,8 +85,6 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("1: %d %d %d %d\n", i, strideSizeTotal, j, strideNum);
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
......@@ -145,8 +142,6 @@ void KernelSoftmaxComputeTensorHalf(__half * x, __half * max, __half * sum, __ha
/* synchronize to make sure the values of max and sum are loaded */
__syncthreads();
//printf("2: %d %d %d %d\n",i ,stride * blockNum ,j ,strideNum);
if(i < stride * blockNum && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
......@@ -256,7 +251,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
if (leadDim != 0 || dimensionSize <= 10) {
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensor <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((DTYPE*)x->data, (DTYPE*)max->data, (DTYPE*)sum->data, (DTYPE*)y->data,
stride, dimensionSize, stride * dimensionSize, blockNum, stride * blockNum);
......@@ -269,8 +263,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
}
else if(x->dataType == X_FLOAT16 && y->dataType == X_FLOAT16){
//printf("%d\n\n",dimensionSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelSoftmaxComputeTensorHalf <<< dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >>>
((__half*)x->data, (__half*)max->data, (__half*)sum->data, (__half*)y->data,
stride, dimensionSize, blockNum);
......@@ -279,10 +271,6 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
ShowNTErrors("TODO!");
}
/*XTensor y1;
y1 = ConvertDataType(*y, X_FLOAT);
y1.Dump(stderr, "y1:");*/
BacktoCudaDev(x->devID, devIDBackup);
}
......