Commit 9a477f7d by linye

Clean the codes

parent 44bf9fa6
...@@ -74,12 +74,12 @@ void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
node->visitMark = NODE_FINISHED;
}
/*
...@@ -92,7 +92,7 @@ dE/da = IndexToOnehot(b)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
...@@ -101,11 +101,20 @@ void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
...@@ -114,8 +123,7 @@ void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
#include "XNoder.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
...@@ -31,6 +33,60 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
XTensor * output = income.tails[0];
XTensor * gold = NULL;
XTensor * weight = NULL;
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
}
node->visitMark = NODE_FINISHED;
}
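/*
A worked note on the branch above (an illustration added for this write-up, not part of the commit):
when the loss node has a single tail, the node itself is the final error E, so dE/dE = 1 and the
gradient is simply filled with ones by the _SetDataFixed* calls. When the operator is LOSS_CROSSENTROPY,
and assuming the cross-entropy definition used elsewhere in NiuTrans.Tensor,
E = -sum_i gold_i * log(output_i)
so the backward step writes
dE/d(output_i) = -gold_i / output_i
(optionally multiplied by a class weight and zeroed where padding == 0), which is what
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim) stores in dedy.
*/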
/* indicates whether the node is for a loss computation */
bool XLossGrad::IsLossOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
......
...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
...@@ -34,6 +35,14 @@ namespace nts{
class XLossGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a Loss computation */
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
......
...@@ -81,6 +81,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SCALE)
GradScale(node, isEfficient);
else if(operID == MATH_DESCALE)
GradDescale(node, isEfficient);
else if(operID == MATH_SHIFT)
GradShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
...@@ -99,6 +105,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node, isEfficient);
else if (operID == MATH_MULANDSHIFT)
GradMulAndShift(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
...@@ -717,12 +725,18 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F);
}
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
...@@ -887,88 +901,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("TODO!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
XTensor * input = income.tails[0];
XTensor * mean = income.tails[1];
XTensor * var = income.tails[2];
XTensor * a = income.tails[3];
XTensor * b = income.tails[4];
XTensor * c = NewTensor(var);
XTensor * d = NewTensor(a);
XTensor * e = NewTensor(a);
XTensor * f = NewTensor(a);
XTensor * g = NewTensor(a);
XTensor * h = NewTensor(a);
XTensor * i = NewTensor(a);
XTensor * j = NewTensor(a);
XTensor * k = NewTensor(var);
XTensor * p = NewTensor(var);
XTensor * q = NewTensor(var);
XTensor * r = NewTensor(a);
XTensor * x = NewTensor(mean);
XTensor * y = NewTensor(mean);
XTensor * z = NewTensor(mean);
DTYPE epsilon = income.GetParam(1);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(input);
XNoder::MakeGrad(mean);
XNoder::MakeGrad(var);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dEdinput */
_ScaleAndShift(var, c, 1.0F, epsilon);
_Unsqueeze(c, d, dim, n);
_Power(d, e, -0.5F);
_Multiply(a, e, f);
_Multiply(node->grad, f, input->grad, 1.0F);
/* dEdmean */
_ScaleAndShift(f, g, -1.0F);
_ReduceSum(g, x, dim);
_ReduceSum(node->grad, y, dim);
_Multiply(y, x, mean->grad, 1.0F);
/* dEdvar */
_Unsqueeze(mean, h, dim, n);
_Sub(input, h, i);
_Multiply(a, i, j);
_Power(var, k, -1.5F);
_ScaleAndShift(k, p, -0.5F);
_ReduceSum(j, z, dim);
_Multiply(z, p, q);
_Multiply(y, q, var->grad, 1.0F);
/* dEda */
_Multiply(i, e, r);
_Multiply(node->grad, r, a->grad, 1.0F);
/* dEdb */
_Sum(b->grad, node->grad, b->grad);
node->visitMark = NODE_FINISHED;
delete c;
delete d;
delete e;
delete f;
delete g;
delete h;
delete i;
delete j;
delete k;
delete p;
delete q;
delete r;
delete x;
delete y;
delete z;
}
/*
...@@ -1029,6 +963,82 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
}
/*
gradient for Scale
for
c = a * scale
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALE!");
XTensor * a = income.tails[0];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Descale
for
c = a / descale
we have
dE/da = dE/dc / descale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for DESCALE!");
XTensor * a = income.tails[0];
DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1/descale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Shift
for
c = a + shift
we have
dE/da = dE/dc
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SHIFT!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
node->visitMark = NODE_FINISHED;
}
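/*
A combined example for the three rules above (illustration only, not part of the commit):
with scale = 2, descale = 4, shift = 3 and an upstream gradient dE/dc = [1, 2],
Scale: c = a * 2 -> a->grad += [1, 2] * 2 = [2, 4] (the _Sum call with beta = scale)
Descale: c = a / 4 -> a->grad += [1, 2] * (1/4) = [0.25, 0.5] (the _Sum call with beta = 1/descale)
Shift: c = a + 3 -> a->grad += [1, 2] (a plain _Sum, since d(a + shift)/da = 1)
*/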
/*
gradient for minus
for
c = a - b * \beta
...@@ -1487,4 +1497,126 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
/*
gradient for MulAndShift
for c = matmul(x, w) + b
we have
dE/dx = dE/dc * w^T
dE/dw = x^T * dE/dc
dE/db = dE/dc reduced (summed) over all dimensions except n (the dimension that matches b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 3, "wrong input tensor number")
XTensor * x = income.tails[0];
XTensor * w = income.tails[1];
XTensor * b = income.tails[2];
int n = income.GetParamInt(0);
MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
/* compute dE/db */
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
/* compute dE/dx, dE/dw */
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * dedw = w->grad;
XTensor * dedx = x->grad;
if (x->order == 2 && w->order == 2)
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
int orderBackupX = x->order;
int orderBackupC = c->order;
int dimsBackupX[MAX_TENSOR_DIM_NUM];
int dimsBackupC[MAX_TENSOR_DIM_NUM];
memcpy(dimsBackupX, x->dimSize, sizeof(int) * x->order);
memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
x->Reshape(x->unitNum / x->GetDim(-1), x->GetDim(-1));
c->Reshape(c->unitNum / c->GetDim(-1), c->GetDim(-1));
if (!isEfficient || x->isGrad)
dedx->Reshape(dedx->unitNum / dedx->GetDim(-1), dedx->GetDim(-1));
dedc->Reshape(dedc->unitNum / dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
x->Reshape(orderBackupX, dimsBackupX);
c->Reshape(orderBackupC, dimsBackupC);
if (!isEfficient || x->isGrad)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
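/*
Shape sketch for the dE/db branch above (illustration only, not part of the commit):
suppose c = matmul(x, w) + b with dE/dc of size (batch, len, |b|) = (2, 3, 4) and n = order - 1.
dE/dc is reshaped to a (2 * 3, 4) matrix, _ReduceSum over dimension 0 gives a length-4 row vector,
and that vector is added into b->grad, i.e. dE/db_j = sum over all rows of dE/dc[., j].
When n is an inner dimension, the same idea is applied with a 3D reshape (x, |b|, z) and two
reductions (over z, then over x). The dE/dx and dE/dw parts are the usual matrix rules
dE/dx = dE/dc * w^T and dE/dw = x^T * dE/dc, delegated to GradMatrixMul.
*/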
}
...@@ -130,6 +130,18 @@ private:
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Scale */
static
void GradScale(XTensor * node, bool isEfficient);
/* gradient for Shift */
static
void GradShift(XTensor * node, bool isEfficient);
/* gradient for Descale */
static
void GradDescale(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);
...@@ -168,6 +180,10 @@ private:
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node, bool isEfficient);
/* gradient for MulAndShift: c = matmul(x, w) + b */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
};
}
......
...@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
...@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
...@@ -126,6 +128,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
int order = node->grad->order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
input->grad->Reshape(order, dimSize);
node->visitMark = NODE_FINISHED;
}
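/*
Reading of the backward pass above (illustration only; the exact semantics of _DropoutWithIndex are
assumed from its use here): the gradient starts as a copy of dE/dc (_CopyValues into input->grad),
input->grad is temporarily viewed as a 1D tensor so that the entries of index address absolute
positions, the same index-based dropout pattern is applied to it via _DropoutWithIndex, the surviving
values are rescaled by "scale" (for inverted dropout this would be 1 / (1 - p)), and the tensor is
reshaped back to its original order.
*/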
/*
gradient for merge
for
......
...@@ -54,6 +54,10 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with indices */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
......
...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XNET_H__
#define __XNET_H__
...@@ -111,6 +112,10 @@ struct XNet
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
};
/* we make a unique id for every tensor */
......
...@@ -839,38 +839,14 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor h_last1;
h_last1 = ScaleAndShift(h_last, 100, 0);
XTensor w1;
w1 = ScaleAndShift(w, 100, 0);
XTensor int8H_last;
XTensor int8W;
int8H_last = ConvertDataType(h_last1, X_INT8);
int8W = ConvertDataType(w1, X_INT8);
XTensor s1;
InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);
_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
XTensor b2D;
InitTensor(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
XTensor b2D1;
b2D1 = ConvertDataType(b2D, X_INT);
_Sum(&s1, &b2D1, &s1);
s = ConvertDataType(s1, X_FLOAT);
s = ScaleAndShift(s, 0.0001, 0);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
...@@ -1224,6 +1200,7 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
...@@ -53,52 +53,15 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
...@@ -108,17 +71,17 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
...@@ -138,150 +101,95 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transofmration before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
XTensor halfK2;
XTensor halfQ2;
XTensor halfV2;
XTensor halfK;
halfK = ConvertDataType(k, X_FLOAT16);
if (selfatt) {
XTensor halfCon;
XList halfSplit;
halfCon = MMul(halfK, wbig);
int d1 = halfCon.GetDim(0);
int d2 = halfCon.GetDim(1);
int d3 = halfCon.GetDim(2) / 3;
InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);
halfSplit.Add(&halfQ2);
halfSplit.Add(&halfK2);
halfSplit.Add(&halfV2);
Split(halfCon, halfSplit, 2, 3);
}
else {
XTensor halfQ;
XTensor halfV;
halfQ = ConvertDataType(q, X_FLOAT16);
halfV = ConvertDataType(v, X_FLOAT16);
/* linear transformation before self-attention */
halfK2 = MMul(halfK, wk);
halfQ2 = MMul(halfQ, wq);
halfV2 = MMul(halfV, wv);
}
XTensor halfKheads;
XTensor halfQheads;
XTensor halfVheads;
/* multi head */
halfKheads = Split(halfK2, halfK2.order - 1, nhead);
halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
halfVheads = Split(halfV2, halfV2.order - 1, nhead);
XTensor halfAtt;
XTensor halfDot;
XTensor halfScalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);
//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
if (isMasked) {
XTensor halfMask;
halfMask = ConvertDataType(mask, X_FLOAT16);
halfDot = Sum(halfDot, halfMask);
}
halfDot = Linear(halfDot, 1.0F / (float)sqrt((float)dk / nhead));
halfScalar = Softmax(halfDot, -1);
if (isTraining && dropoutP > 0)
halfScalar = Dropout(halfScalar, dropoutP);
halfAtt = BMMul(halfScalar, halfVheads);
/* concatenate the heads */
return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
/* linear transformation before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the network given a big tensor that keeps keys, queries and values
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
kqv2 = MMul(kqv, wbig);
int d1 = kqv2.GetDim(0);
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
Split(kqv2, split, 2, 3);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MMul(Merge(att, att.order - 1), wa);
}
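/*
The computation implemented by MakeAttention, written out (illustration only, not part of the commit):
with nhead heads of width dk/nhead, obtained by splitting the last dimension, each head computes
att = softmax( (Q * K^T + mask) / sqrt(dk / nhead) ) * V
where the mask adds 0 to visible positions and a large negative value (about -1e9) to blocked ones,
so those positions vanish after the softmax; note this implementation adds the mask before the
1/sqrt scaling, which does not change its effect. The heads are then merged back along the last
dimension and projected with wa.
*/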
}
...@@ -61,6 +61,7 @@ public:
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
...@@ -96,7 +97,13 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
/* make the network given a big tensor that keeps keys, queries and values */
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
...@@ -80,7 +80,6 @@ void AttDecoder::InitModel(int argc, char ** argv,
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
...@@ -89,9 +88,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
...@@ -122,7 +119,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
...@@ -136,7 +133,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
...@@ -100,4 +100,4 @@ public:
}
#endif
\ No newline at end of file
...@@ -103,8 +103,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
...@@ -116,7 +114,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res;
/* self attention */
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
...@@ -160,4 +158,3 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
...@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
//t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
//return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2);
}
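/*
Note on the change above (illustration only, not part of the commit): MulAndShift(x, w, b) is used
here as a fused form of MMul(x, w) + b, so the layer still computes the usual position-wise
feed-forward FFN(x) = max(0, x * w1 + b1) * w2 + b2, with Rectify providing the max(0, .) part;
the fused call lets the backward pass go through GradMulAndShift (see XBackwardMath) instead of
separate matmul and sum nodes.
*/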
......
...@@ -203,31 +203,49 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
}
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskEnc - mask of the encoder self-attention
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
...@@ -236,8 +254,6 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
//_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
...@@ -273,20 +289,98 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
//encoding.Dump(stderr, "encoding",10);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
//decoding.Dump(stderr, "decoding", 10);
outputLayer->Make(decoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
}
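/*
Worked example for the padding mask above (illustration only, not part of the commit): assume a
source padding row [1, 1, 0] (two real tokens, one pad). _Unsqueeze copies it along a new length
dimension and then along a head dimension, and _ScaleAndShiftMe(padding3, 1e9F, -1e9F) maps
1 -> 0 and 0 -> -1e9, so every attention row sees [0, 0, -1e9]: padded positions get (near) zero
probability after the softmax while real positions are left untouched.
*/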
/*
make the mask of the decoder
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims;
}
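/*
Worked example for the decoder self-attention mask above (illustration only, not part of the commit):
for a target length of 3, _SetDataLowTri(&maskDec, 1e9F, 0) fills the lower triangle (including the
diagonal) with 1e9 and the rest with 0; _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F) then yields
0 -1e9 -1e9
0 0 -1e9
0 0 0
so position i can attend to positions j <= i only, which is the usual autoregressive constraint.
*/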
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
......
...@@ -31,6 +31,9 @@
namespace transformer
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
{
public:
...@@ -78,7 +81,21 @@ public:
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
void GetParams(XList &list);
......
...@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
...@@ -176,6 +176,9 @@ public:
/* indicates whether we intend to debug the net */
bool isDebugged;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TTrainer();
...@@ -205,10 +208,10 @@ public:
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
...@@ -216,7 +219,7 @@ public:
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
...@@ -226,9 +229,9 @@ public:
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
......
...@@ -36,8 +36,6 @@ int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
...@@ -65,19 +63,22 @@ int TransformerMain(int argc, const char ** argv)
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
// model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
/* load the model if necessary */
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
......
...@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......
...@@ -47,15 +47,8 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
inline unsigned short cal_complement(unsigned short sig, unsigned short tal);
unsigned short Float16Add(unsigned short a, unsigned short b);
unsigned short Float16Sub(unsigned short a, unsigned short b);
unsigned short Float16Mul(unsigned short a, unsigned short b);
unsigned short Float16Div(unsigned short a, unsigned short b);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
unsigned short FloatbitsToHalfbits(float ff);
float HalfbitsToFloatbits(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
......
...@@ -266,6 +266,10 @@ XDevManager::XDevManager()
{
Clear();
Init();
#ifndef USE_CPP11
fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
#endif
}
/* de-constructor */
......
...@@ -43,13 +43,17 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
#define USE_CPP11
#endif
#define _XINLINE_
//#define DOUBELPRICSION
#ifdef DOUBELPRICSION
#define DTYPE double
#define DTYPE_MIN (DTYPE)-1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
......
...@@ -307,6 +307,27 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
MakeLink(&list, h, id);
}
/*
create a hyperedge with three input tensors and an output tensor
>> t1 - a tail tensor
>> t2 - the second tail tensor
>> t3 - the third tail tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id)
{
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
MakeLink(&list, h, id);
}
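/*
Usage sketch for the three-tail overload above (hypothetical call, for illustration only): an operator
with three inputs, such as the fused multiply-and-shift c = matmul(x, w) + b, could record its incoming
edge with
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
so that backward propagation can later find x, w and b as income.tails[0..2] of c.
*/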
/*
create a hyper edge with a list of tensors and an output tensor
>> list - a list of input tensors
...@@ -509,6 +530,88 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
}
}
/*
copy a node with another, i.e., we add the links to the new node
>> reference - the node to be copied
>> target - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
/* /*
copy incoming edges of a given node copy incoming edges of a given node
>> reference - the node we copy from >> reference - the node we copy from
...@@ -634,6 +737,29 @@ void XLink::ShowNode(FILE * file, XTensor * node) ...@@ -634,6 +737,29 @@ void XLink::ShowNode(FILE * file, XTensor * node)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
/*
search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
XLink &incoming = top->income;
for(int i = 0; i < incoming.tailNum; i++){
XTensor * child = incoming.tails[i];
XTensor * hit = SearchNode(child, name);
if(hit != NULL)
return hit;
}
return NULL;
}*/
} // namespace nts(NiuTrans.Tensor)
...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
struct XTensor;
#define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64
/*
...@@ -138,6 +138,10 @@ struct XLink
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with three input tensors and an output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and an output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
...@@ -170,6 +174,10 @@ struct XLink
static
void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy the links of a reference node to another node, i.e., the target node inherits the reference node's links */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */ /* copy links of a given node */
static static
void CopyIncoming(const XTensor * reference, XTensor * target); void CopyIncoming(const XTensor * reference, XTensor * target);
...@@ -181,6 +189,10 @@ struct XLink ...@@ -181,6 +189,10 @@ struct XLink
/* show a node */ /* show a node */
static static
void ShowNode(FILE * file, XTensor * node); void ShowNode(FILE * file, XTensor * node);
/* search for a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
}; };
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -77,6 +77,14 @@ const char * GetOPName(int type) ...@@ -77,6 +77,14 @@ const char * GetOPName(int type)
return "M_POWER"; return "M_POWER";
else if (type == MATH_SCALEANDSHIFT) else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT)
return "M_OPERATION";
else if (type == MATH_SIGN) else if (type == MATH_SIGN)
return "M_SIGN"; return "M_SIGN";
else if (type == MATH_SUB) else if (type == MATH_SUB)
...@@ -100,23 +108,25 @@ const char * GetOPName(int type) ...@@ -100,23 +108,25 @@ const char * GetOPName(int type)
else if (type == REDUCE_REDUCEVARIANCE) else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE"; return "R_REDUCEVARIANCE";
} }
else if ((type & DATA_BASE) != 0) { else if ((type & DATA_BASE) != 0) {
if (type == GETANDSET_CONVERTDATATYPE) if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE"; return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT) else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT"; return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX) else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX"; return "G_ONEHOTTOINDEX";
} else if (type == GETANDSET_SELECT)
else if ((type & SHAPE_BASE) != 0) { return "G_SELECT";
if (type == GETANDSET_SELECT) }
return "G_SELECT"; else if ((type & SHAPE_BASE) != 0) {
else if (type == MOVEMENT_COPYINDEXED) if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED"; return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES) else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES"; return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER) else if (type == MOVEMENT_GATHER)
return "M_GATHER"; return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE) else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE"; return "S_CONCATENATE";
else if (type == SHAPE_MERGE) else if (type == SHAPE_MERGE)
...@@ -158,6 +168,10 @@ const char * GetOPName(int type) ...@@ -158,6 +168,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX) else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX"; return "F_SOFTMAX";
} }
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL"; return "NULL";
} }
......
...@@ -57,7 +57,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -57,7 +57,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_NORMALIZE MATH_NEGATE + 1 #define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1 #define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1 #define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1 #define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1 #define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1 #define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1 #define MATH_SUM MATH_SUBDIM + 1
...@@ -84,8 +89,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -84,8 +89,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYINDEXED MOVEMENT + 1 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1 #define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
...@@ -111,6 +117,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -111,6 +117,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1 #define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1 #define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
...@@ -48,7 +48,6 @@ ...@@ -48,7 +48,6 @@
#include "core/math/ScaleAndShift.h" #include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h" #include "core/getandset/SetData.h"
#include "function/Identity.h" #include "function/Identity.h"
#include "core/getandset/ConvertDataType.h"
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -60,7 +59,6 @@ ...@@ -60,7 +59,6 @@
#include "core/utilities/FlushToMem.cuh" #include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh" #include "core/utilities/SetAscendingOrder.cuh"
#endif #endif
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
...@@ -70,8 +68,6 @@ int tensorIDGlobal = 0; ...@@ -70,8 +68,6 @@ int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex; MUTEX_HANDLE tensorMutex;
XTensor NULLTensor; XTensor NULLTensor;
#define RAND_MAX16 0xff
/* generate a tensor id */ /* generate a tensor id */
int MakeTensorID() int MakeTensorID()
{ {
...@@ -196,6 +192,36 @@ XTensor::XTensor(const XTensor &reference) ...@@ -196,6 +192,36 @@ XTensor::XTensor(const XTensor &reference)
isTmp = reference.isTmp; isTmp = reference.isTmp;
} }
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor::XTensor(const XTensor &&reference)
{
Init();
SetDataPointer();
id = MakeTensorID();
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
here. So we save the ADDRESS of reference.data in
reference.dataP, and do this work by updating "*reference.dataP".
This is VERY tricky and might not be the best solution :) */
*reference.dataP = NULL;
XLink::Replace(&reference, this);
isInit = true;
isTmp = reference.isTmp;
}
#endif
/* de-constructor */ /* de-constructor */
XTensor::~XTensor() XTensor::~XTensor()
{ {
...@@ -215,7 +241,6 @@ XTensor::~XTensor() ...@@ -215,7 +241,6 @@ XTensor::~XTensor()
XLink::Replace(this, newTensor); XLink::Replace(this, newTensor);
} }
XLink::ClearOutgoing(this); XLink::ClearOutgoing(this);
XLink::ClearIncoming(this); XLink::ClearIncoming(this);
...@@ -298,7 +323,7 @@ void XTensor::ShallowCopy(const XTensor &tensor) ...@@ -298,7 +323,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
/* overloading of the equal-sign */ /* overloading of the equal-sign */
XTensor& XTensor::operator= (const XTensor& tensor) XTensor& XTensor::operator= (const XTensor& tensor)
{ {
/* we must make a hard copy of the tensor if it is the input /* we must make a hard copy of the tensor if it is the input
of another node. */ of another node. */
if(outgo.tailNum > 0){ if(outgo.tailNum > 0){
...@@ -373,50 +398,97 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -373,50 +398,97 @@ XTensor& XTensor::operator= (const XTensor& tensor)
return *this; return *this;
} }
/* overloading of the equal-sign (with right value reference) */
XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
newTensor->ShallowCopy(this);
data = NULL;
dataHost = NULL;
}
DestroyData();
ShallowCopy(tensor);
isInit = true;
devID = tensor.devID;
mem = tensor.mem;
data = tensor.data;
signature = tensor.signature;
/* what we really want to do is "tensor.data = NULL;"
As "tensor" is constant, we cannot reset tensor.data
here. So we save the ADDRESS of tensor.data in
tensor.dataP, and do this work by updating "*tensor.dataP".
This is VERY tricky and might not be the best solution :) */
*tensor.dataP = NULL;
XLink::Replace(&tensor, this);
return *this;
}
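/* a minimal sketch of what the right-value overloads buy us (assuming
   USE_CPP11 is defined so they are compiled in): assigning a temporary hands
   its data buffer over instead of triggering a deep copy */
void MoveAssignSketch(const XTensor &a, const XTensor &b)
{
    XTensor c;
    c = Sum(a, b);  /* the temporary's data pointer is taken over by c and
                       reset through dataP, so the buffer is not freed twice */
}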
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor XTensor::operator+ (const XTensor& tensor) XTensor XTensor::operator+ (const XTensor& tensor) const
{ {
return Sum(*this, tensor); return Sum(*this, tensor);
} }
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor XTensor::operator+ (const DTYPE shift) XTensor XTensor::operator+ (const DTYPE shift) const
{ {
return ScaleAndShift(*this, 1, shift); return ScaleAndShift(*this, 1, shift);
} }
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor XTensor::operator* (const XTensor& tensor) XTensor XTensor::operator* (const XTensor& tensor) const
{ {
return Multiply(*this, tensor); return Multiply(*this, tensor);
} }
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor XTensor::operator* (const DTYPE scale) XTensor XTensor::operator* (const DTYPE scale) const
{ {
return ScaleAndShift(*this, scale, 0); return ScaleAndShift(*this, scale, 0);
} }
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor XTensor::operator- (const XTensor& tensor) XTensor XTensor::operator- (const XTensor& tensor) const
{ {
return Sub(*this, tensor); return Sub(*this, tensor);
} }
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor XTensor::operator- (const DTYPE shift) XTensor XTensor::operator- (const DTYPE shift) const
{ {
return ScaleAndShift(*this, 1, -shift); return ScaleAndShift(*this, 1, -shift);
} }
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor) XTensor XTensor::operator/ (const XTensor& tensor) const
{ {
return Div(*this, tensor); return Div(*this, tensor);
} }
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale) XTensor XTensor::operator/ (const DTYPE scale) const
{ {
return ScaleAndShift(*this, (DTYPE)1/scale, 0); return ScaleAndShift(*this, (DTYPE)1/scale, 0);
} }
...@@ -426,7 +498,7 @@ linear transformation b = a * \scale + \shift ...@@ -426,7 +498,7 @@ linear transformation b = a * \scale + \shift
>> scale - the slope >> scale - the slope
>> shift - the intercept >> shift - the intercept
*/ */
XTensor XTensor::Lin(DTYPE scale, DTYPE shift) XTensor XTensor::Lin(DTYPE scale, DTYPE shift) const
{ {
return Linear(*this, scale, shift); return Linear(*this, scale, shift);
} }
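/* since the arithmetic operators are now const-qualified, they can be applied
   to const references directly; a small illustrative sketch (a and b are
   assumed to have the same shape): */
XTensor LinComboSketch(const XTensor &a, const XTensor &b)
{
    return a * 2.0F + b.Lin(0.5F, 1.0F) - 1.0F;
}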
...@@ -462,6 +534,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b) ...@@ -462,6 +534,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
return true; return true;
} }
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
return false;
for (int i = 0; i < b->order; i++) {
if (i < dim) {
if (a->dimSize[i] != b->dimSize[i])
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
return false;
return true;
}
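/* a quick illustration of IsReduceShaped: b must have exactly the shape of a
   with dimension "dim" removed, e.g. a: (4, 5, 6), b: (4, 6), dim = 1 */
void ReduceShapeCheckSketch(const XTensor * a, const XTensor * b)
{
    CheckNTErrors(XTensor::IsReduceShaped(a, b, 1),
                  "b is not a reduction of a along dimension 1!");
}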
/* /*
judge whether the three matrices are in the same type and size judge whether the three matrices are in the same type and size
>> a - input tensor >> a - input tensor
...@@ -712,15 +815,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper) ...@@ -712,15 +815,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
d = new double[unitNum]; d = new double[unitNum];
for (int i = 0; i < unitNum; i++) { for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = lower + variance * rand() / RAND_MAX; *((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
else if (dataType == X_FLOAT16) {
unsigned short random;
unsigned short ulower = FloatToFloat16(lower), uvariance = FloatToFloat16(variance);
d = new unsigned short[unitNum];
for (int i = 0; i < unitNum; i++) {
random = FloatToFloat16(rand() % RAND_MAX16 * 1.0 / RAND_MAX16);
*((unsigned short*)d + i) = Float16Add(ulower, Float16Mul(uvariance, random));
} }
} }
else { else {
...@@ -1634,17 +1728,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1634,17 +1728,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, " %d", f); fprintf(file, " %d", f);
} }
} }
else if (dataType == X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
unsigned short f = ((unsigned short*)d)[i];
if (i == beg)
fprintf(file, "%u", f);
else
fprintf(file, " %u", f);
}
}
else else
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
...@@ -1681,22 +1764,9 @@ dump data to a file ...@@ -1681,22 +1764,9 @@ dump data to a file
*/ */
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose) void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{ {
if (tensor->dataType == X_FLOAT) XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
{ _CopyValues(tensor, &a);
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem); a.Dump(file, label, n, beg, verbose);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
_ConvertDataType(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else
{
ShowNTErrors("TO DO!");
}
} }
/* /*
...@@ -1774,14 +1844,6 @@ void XTensor::Read(FILE * file, const char * label) ...@@ -1774,14 +1844,6 @@ void XTensor::Read(FILE * file, const char * label)
} }
} }
} }
else if (dataType == X_FLOAT16) {
for (int i = 0; i < unitNum; i++) {
unsigned short * f = ((unsigned short*)data) + i;
if (fscanf(file, "%u", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
}
}
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
...@@ -189,6 +189,11 @@ public: ...@@ -189,6 +189,11 @@ public:
/* copy constructor */ /* copy constructor */
XTensor(const XTensor &reference); XTensor(const XTensor &reference);
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor(const XTensor &&reference);
#endif
/* de-constructor */ /* de-constructor */
~XTensor(); ~XTensor();
...@@ -204,32 +209,37 @@ public: ...@@ -204,32 +209,37 @@ public:
/* overloading of the equal-sign */ /* overloading of the equal-sign */
XTensor& operator= (const XTensor &tensor); XTensor& operator= (const XTensor &tensor);
/* overloading of the equal-sign (with right value reference) */
#ifdef USE_CPP11
XTensor& operator= (const XTensor &&tensor);
#endif
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor); XTensor operator+ (const XTensor &tensor) const;
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift); XTensor operator+ (const DTYPE shift) const;
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor); XTensor operator* (const XTensor &tensor) const;
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale); XTensor operator* (const DTYPE scale) const;
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor); XTensor operator- (const XTensor &tensor) const;
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor operator- (const DTYPE shift); XTensor operator- (const DTYPE shift) const;
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor); XTensor operator/ (const XTensor &tensor) const;
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor operator/ (const DTYPE scale); XTensor operator/ (const DTYPE scale) const;
/* linear transformation */ /* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0); XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
/* judge whether the two matrices are in the same type and size */ /* judge whether the two matrices are in the same type and size */
static static
...@@ -239,6 +249,10 @@ public: ...@@ -239,6 +249,10 @@ public:
static static
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c); bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether b is the reduced shape of a ?? */
static
bool IsReduceShaped(const XTensor * a, const XTensor * b, int dim);
/* set the size of each dimension */ /* set the size of each dimension */
void SetDim(int * myDimSize); void SetDim(int * myDimSize);
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "arithmetic/Div.h" #include "arithmetic/Div.h"
#include "arithmetic/DivDim.h" #include "arithmetic/DivDim.h"
#include "arithmetic/Mask.h"
#include "arithmetic/MatrixMul.h" #include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h" #include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
...@@ -44,12 +45,14 @@ ...@@ -44,12 +45,14 @@
#include "arithmetic/SumByColumnVT.h" #include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h" #include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h" #include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Binary.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h" #include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
......
...@@ -214,4 +214,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -214,4 +214,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
return c; return c;
} }
/*
element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
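/* a minimal usage sketch of the in-place interface above (the shapes and the
   InitTensor2D call are illustrative; InitTensor2D is assumed to be available
   from XTensor.h): */
void DivSketch()
{
    XTensor a, b, c;
    InitTensor2D(&a, 2, 3);
    InitTensor2D(&b, 2, 3);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(1.0F, 2.0F);
    /* c(i) = a(i)/b(i); the link is recorded only when the last flag is true */
    Div(a, b, c, 0.0F, 0, true);
}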
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -43,15 +43,6 @@ void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size) ...@@ -43,15 +43,6 @@ void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] / b[i]; c[i] = a[i] / b[i];
} }
__global__
void KernelDivElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] / b[i];
}
/* /*
division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
>> a - data array a >> a - data array a
...@@ -69,18 +60,6 @@ void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp ...@@ -69,18 +60,6 @@ void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] / b[i] + alpha * c[i]; c[i] = a[i] / b[i] + alpha * c[i];
} }
__global__
void KernelDivElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] / b[i] + alpha1 * c[i];
#endif
}
/* /*
division of two tensors in a element-wise manner c(i) = a(i)/b(i). division of two tensors in a element-wise manner c(i) = a(i)/b(i).
Note that a and b can be of different sizes here, i.e., Note that a and b can be of different sizes here, i.e.,
...@@ -201,25 +180,6 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in ...@@ -201,25 +180,6 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
} }
} }
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelDivElementWiseHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum);
else
KernelDivElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __DIV_H__ #endif // __DIV_H__
\ No newline at end of file
...@@ -162,5 +162,36 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha) ...@@ -162,5 +162,36 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
return c; return c;
} }
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - the result tensor
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
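/* an illustrative sketch of the broadcasting case: divide every row of a
   (shape (3, 4)) by a length-4 vector b along dimension 1 */
void DivDimSketch(const XTensor &a, const XTensor &b)
{
    XTensor c;
    DivDim(a, b, c, 1, (DTYPE)0.0, false);
}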
} }
...@@ -52,6 +52,14 @@ i.e., a is divided with b by broadcasting ...@@ -52,6 +52,14 @@ i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it we make a new tensor c to keep the result and return it
*/ */
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0); XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Mask.h"
#include "Mask.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in masking!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors(a->dataType == mask->dataType && a->dataType == c->dataType,
// "Unmatched tensors in addition!");
if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
#endif
if ((a->devID < 0 && mask->devID >= 0) ||
(a->devID >= 0 && mask->devID < 0) ||
(a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaMask(a, mask, c, alpha);
}
else
_CudaMask(a, mask, c, alpha);
#endif
}
else {
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
int * maskp = (int*)mask->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
if (maskp[i + 1] == 0) {
cp[i + 1] = alpha;
}
else {
cp[i + 1] = ap[i + 1];
}
}
}
else {
for (int i = 0; i < num; i++) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
{
_Mask(a, mask, a, alpha);
}
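/* a small usage sketch (names are illustrative): entries of "a" whose mask
   value is 0 are overwritten with a large negative constant, e.g. before a
   softmax over padded positions */
void MaskPaddingSketch(XTensor * a, XTensor * padMask)
{
    /* padMask must be an X_INT tensor with the same number of entries as a */
    _MaskMe(a, padMask, (DTYPE)-1e9F);
}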
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
mask entries of a given tensor (CUDA Kernel)
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
>> a - the input data array
>> mask - the mask data array (in X_INT)
>> c - where we put the masked a
>> size - the size of a/mask/c
>> alpha - the value used for masked-out entries
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (mask[i] == 0) {
c[i] = alpha;
}
else {
c[i] = a[i];
}
}
}
/*
mask entries of a given tensor (cuda version)
>> a - a tensor
>> mask - mask tensor
>> c - where we put masked a
>> alpha - the value used for masked-out entries
*/
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in masking!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors((a->dataType == mask->dataType && a->dataType == c->dataType),
// "Unmatched tensors in addition!");
CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelMASK << <blocks, threads >> >((DTYPE*)a->data, (int *)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_CUH__
#define __MASK_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* mask entries of a given tensor (cuda version) */
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
...@@ -59,10 +59,17 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x ...@@ -59,10 +59,17 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
/* matrix multiplication with no transposition c = a * b * alpha*/ /* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__ #endif // __MATRIXMUL_H__
\ No newline at end of file
...@@ -50,7 +50,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -50,7 +50,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XPRunner * parallelRunner, XStream * stream) XPRunner * parallelRunner, XStream * stream)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!"); CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!"); "Input tensors must have a order = 2!");
...@@ -78,11 +78,19 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -78,11 +78,19 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) { if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in multiplication!"); CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in multiplication!");
if (useBLAS) if (a->dataType == DEFAULT_DTYPE &&
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta); b->dataType == DEFAULT_DTYPE &&
else c->dataType == DEFAULT_DTYPE)
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner); {
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
/* a dense matrix multiply a sparse matrix */ /* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) { else if (!a->isSparse && b->isSparse) {
......
...@@ -156,12 +156,18 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,12 +156,18 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL) if (stream != NULL)
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
b->data, transposedB, a->dataType, c->data, c->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
a->dimSize[0], a->dimSize[1], b->data, transposedB, a->dataType, c->data, c->dataType,
b->dimSize[0], b->dimSize[1], a->dimSize[0], a->dimSize[1],
c->dimSize[0], c->dimSize[1], b->dimSize[0], b->dimSize[1],
alpha, beta); c->dimSize[0], c->dimSize[1],
alpha, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
/* a dense matrix multiply a sparse matrix */ /* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) { else if (!a->isSparse && b->isSparse) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MulAndShift.h"
#include "MatrixMul.h"
#include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return the dimension index if the sum can be performed as SumDim (see SumDim.h for details)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
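/* a quick sanity sketch (shapes illustrative): for a product of shape
   (8, 32, 512) and a bias of shape (512), the hit dimension is the last one,
   so the bias can be added with _SumDim rather than a full _Sum */
void GetSumIndexSketch(const XTensor &prod, const XTensor &bias)
{
    int n = GetSumIndex(prod, bias);
    CheckNTErrors(n == prod.order - 1, "Unexpected broadcasting dimension!");
}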
/*
the MulAndShift operation: c = x * w + b
>> x - tensor x
>> w - tensor w
>> b - tensor b
>> alpha - the scaling factor of the product x * w
>> parallelRunner - parallel processing module
<< return - the result of x * w + b
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = x.dimSizeRDI[1];
int xm = x.dimSizeRDI[0];
int wn = w.dimSizeRDI[1];
int wm = w.dimSizeRDI[0];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(tmp, b);
if (n == -1) {
/* call _Sum function */
_Sum(tmp, &b, &c);
// TODO!!
ShowNTErrors("TODO!");
}
else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */
_SumDim(tmp, &b, &c, n);
}
else {
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
//XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
return c;
}
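/* a minimal usage sketch of MulAndShift as a fused linear layer (names and
   shapes are illustrative): x: (batch, inDim), w: (inDim, outDim), b: (outDim) */
XTensor LinearLayerSketch(const XTensor &x, const XTensor &w, const XTensor &b)
{
    return MulAndShift(x, w, b);  /* y = x * w + b */
}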
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#ifndef __MULANDSHIFT_H__
#define __MULANDSHIFT_H__
#include "../../XTensor.h"
#include "../CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULANDSHIFT_H__
...@@ -215,4 +215,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -215,4 +215,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
return c; return c;
} }
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
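/* because GetMultiplyDimIndex is consulted first, one call covers both the
   element-wise and the broadcast case; a sketch (shapes illustrative): */
void MultiplySketch(const XTensor &a, const XTensor &gate)
{
    XTensor c;
    /* a: (batch, dim), gate: (dim) -> dispatched to _MultiplyDim internally */
    Multiply(a, gate, c);
}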
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -43,15 +43,6 @@ void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size) ...@@ -43,15 +43,6 @@ void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] * b[i]; c[i] = a[i] * b[i];
} }
__global__
void KernelMulElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] * b[i];
}
/* /*
multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i)
>> a - data array a >> a - data array a
...@@ -69,18 +60,6 @@ void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp ...@@ -69,18 +60,6 @@ void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] * b[i] + alpha * c[i]; c[i] = a[i] * b[i] + alpha * c[i];
} }
__global__
void KernelMulElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] * b[i] + alpha1 * c[i];
#endif
}
/* /*
multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i). multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i).
Note that a and b can be of different sizes here, i.e., Note that a and b can be of different sizes here, i.e.,
...@@ -201,25 +180,6 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph ...@@ -201,25 +180,6 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
} }
} }
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelMulElementWiseHalf << <blocks, threads >> >((__half *)a->data, (half *)b->data, (half *)c->data, c->unitNum);
else
KernelMulElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLY_H__ #endif // __MULTIPLY_H__
\ No newline at end of file
...@@ -162,6 +162,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n) ...@@ -162,6 +162,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
return c; return c;
} }
/*
tensor multiplication
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - the result tensor
>> n - the dimension index
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
}
/* /*
tensor broadcast multiplication tensor broadcast multiplication
c = a * b + c * \beta c = a * b + c * \beta
...@@ -302,4 +332,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b) ...@@ -302,4 +332,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
return c; return c;
} }
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
}
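/* MultiplyBroadcast, unlike MultiplyDim, lets b keep size-1 dimensions; a
   sketch (shapes illustrative): a of shape (8, 32, 512) scaled by a tensor of
   shape (8, 1, 512) */
void MultiplyBroadcastSketch(const XTensor &a, const XTensor &scale)
{
    XTensor c;
    MultiplyBroadcast(a, scale, c);
}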
} }
...@@ -22,10 +22,6 @@ ...@@ -22,10 +22,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "MultiplyDim.cuh" #include "MultiplyDim.cuh"
#include "../getandset/ConvertDataType.h"
#include "../arithmetic/XTensorBLAS.h"
#include "../math/ScaleAndShift.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -65,25 +61,6 @@ void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha) ...@@ -65,25 +61,6 @@ void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
c[offset] = a[offset] * bv[threadIdx.x]; c[offset] = a[offset] * bv[threadIdx.x];
} }
__global__
void KernelMultiplyWithRowHalf(__half * a, __half * b, __half * c, int rowNum, int colNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
c[offset] = a[offset] * bv[threadIdx.x];
}
/* /*
tensor multiplication of a tensor and a colum vector tensor multiplication of a tensor and a colum vector
c = a * b + \alpha * c c = a * b + \alpha * c
...@@ -125,30 +102,6 @@ void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int bloc ...@@ -125,30 +102,6 @@ void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int bloc
c[offset] = a[offset] * bv[threadIdx.y]; c[offset] = a[offset] * bv[threadIdx.y];
} }
__global__
void KernelMultiplyWithColHalf(__half * a, __half * b, __half * c, int rowNum, int colNum, int blockSize, int blockNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
c[offset] = a[offset] * bv[threadIdx.y];
}
/* /*
tensor multiplication tensor multiplication
...@@ -182,13 +135,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -182,13 +135,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
else if (i < n) else if (i < n)
blockNum *= a->dimSize[i]; blockNum *= a->dimSize[i];
} }
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) { if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F) if(alpha == (DTYPE)0.0F)
...@@ -202,48 +156,23 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -202,48 +156,23 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
} }
else if (stride == 1) { else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F) if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha); blockNum, blockSize, alpha);
else else
KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha); blockNum, blockSize, alpha);
} }
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithColHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockSize, stride, blockSize * stride, blockNum);
}
else if (stride == 1) {
/*__half alpha1 = float2half_rn(alpha);*/
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithRowHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockNum, blockSize);
}
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
BacktoCudaDev(a->devID, devIDBackup); BacktoCudaDev(a->devID, devIDBackup);
} }
......
...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0); ...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n); XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */ /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
we return the resulting tensor here */ we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b); XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
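/* a minimal usage sketch for the two in-place overloads above (illustrative only;
   x, row and y are made-up names, and InitTensor2D/InitTensor1D are assumed from XTensor.h):
       XTensor x, row, y;
       InitTensor2D(&x, 4, 8, X_FLOAT);
       InitTensor1D(&row, 8, X_FLOAT);
       MultiplyDim(x, row, y, 1);           // y[i][j] = x[i][j] * row[j]
       MultiplyDim(x, row, y, 1, true);     // same, with the op linked into the network
   MultiplyBroadcast behaves likewise but expands every size-1 dimension of b
   rather than one named dimension n. */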
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__ #endif // __MULTIPLYDIM_H__
...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a) ...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a)
return b; return b;
} }
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
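/* a minimal usage sketch of the overload above (illustrative; x and y are made-up
   names and InitTensor1D is assumed from XTensor.h):
       XTensor x, y;
       InitTensor1D(&x, 8, X_FLOAT);
       Negate(x, y);          // y is initialized to x's shape on first use, then y = -x
       Negate(x, y, true);    // same, but the op is linked into the network
   note that y is re-initialized only when it is uninitialized or shaped differently from x. */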
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Negate(const XTensor & a); XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__ #endif // __NEGATE_H__
...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a) ...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a)
return b; return b;
} }
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "Sign.h" #include "Sign.h"
#include "Sign.cuh" #include "Sign.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -57,25 +56,9 @@ This is for float16 computation ...@@ -57,25 +56,9 @@ This is for float16 computation
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSignHalf(__half * a, __half * b, int size) void KernelSign(__half * a, __half * b, int size)
{ {
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) return;
__half zero = __float2half(0.0F);
__half one = __float2half(1.0F);
__half one_1 = __float2half(-1.0F);
int i = blockDim.x * blockIdx.x + threadIdx.x;
DTYPE flag = __half2float(a[i]);
if (i < size) {
if (flag > 0)
b[i] = one;
else if (flag < 0)
b[i] = one_1;
else
b[i] = zero;
}
#endif
} }
/* /*
...@@ -103,7 +86,7 @@ void _CudaSign(const XTensor * a, XTensor * b) ...@@ -103,7 +86,7 @@ void _CudaSign(const XTensor * a, XTensor * b)
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelSignHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum); KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Sign(const XTensor & a); XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__ #endif // __SIGN_H__
...@@ -194,4 +194,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -194,4 +194,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction c = a - b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
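/* an illustrative sketch of how the overload above dispatches (a, row and c are
   made-up names; InitTensor2D/InitTensor1D are assumed from XTensor.h):
       XTensor a, row, c;
       InitTensor2D(&a, 4, 8, X_FLOAT);
       InitTensor1D(&row, 8, X_FLOAT);
       Sub(a, row, c);        // row matches dimension 1 of a -> broadcast via _SubDim(&a, &row, &c, 1)
       Sub(a, a, c, 0.5F);    // same shape -> plain _Sub, c = a - a * 0.5
   GetSubDimIndex returns -1 for same-shaped inputs and the matched dimension index otherwise. */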
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -22,8 +22,6 @@ ...@@ -22,8 +22,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sub.cuh" #include "Sub.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -48,30 +46,6 @@ void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -48,30 +46,6 @@ void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
} }
/* /*
subtraction of data arrays (CUDA Kernel) Half Precision
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUBHalf(half * a, half * b, half * c, int size, DTYPE beta)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta1;
#endif
}
/*
tensor subtraction c = a - b * \beta (cuda version) tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
...@@ -105,22 +79,10 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -105,22 +79,10 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
dim3 threads(blockSize[0]); dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
else if(a->dataType == X_FLOAT16 && else {
b->dataType == X_FLOAT16 && // TODO!!
c->dataType == X_FLOAT16){ ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUBHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, a->unitNum, beta);
} }
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
else { else {
// TODO!! // TODO!!
......
...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it ...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUB_H__ #endif // __SUB_H__
...@@ -163,4 +163,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -163,4 +163,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
} }
...@@ -21,9 +21,6 @@ ...@@ -21,9 +21,6 @@
#include "SubDim.cuh" #include "SubDim.cuh"
#include "../../XDevice.h" #include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -40,10 +37,11 @@ where a is a tensor and b is a row vector ...@@ -40,10 +37,11 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b) >> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor >> beta - the scaling factor
*/ */
template <class T, bool betaFired>
__global__ __global__
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta,bool betaFired) void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{ {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x; int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y; int row = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -62,59 +60,6 @@ void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, D ...@@ -62,59 +60,6 @@ void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, D
c[offset] = a[offset] - bv[threadIdx.x]; c[offset] = a[offset] - bv[threadIdx.x];
} }
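/* note on the template parameters above: betaFired is a compile-time constant, so the
   per-element branch disappears from the generated kernel; the host side picks the
   instantiation once per call (see _CudaSubDim below), along the lines of
       if (beta == (DTYPE)1.0F)
           KernelSubWithRow<DTYPE, false> <<<blocks, threads>>> (a, b, c, rowNum, colNum, beta);
       else
           KernelSubWithRow<DTYPE, true> <<<blocks, threads>>> (a, b, c, rowNum, colNum, beta);
   (blocks/threads and the pointer names here are illustrative only). */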
__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/* /*
tensor subtraction of a tensor and a column vector tensor subtraction of a tensor and a column vector
c = a - b * \beta c = a - b * \beta
...@@ -128,11 +73,11 @@ where a is a tensor and b is a colum vector ...@@ -128,11 +73,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrices >> blockNum - number of matrices
>> beta - the scaling factor >> beta - the scaling factor
*/ */
template <class T, bool betaFired>
__global__ __global__
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta,bool betaFired) void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{ {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x; int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y; int row = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -156,71 +101,6 @@ void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, i ...@@ -156,71 +101,6 @@ void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, i
c[offset] = a[offset] - bv[threadIdx.y]; c[offset] = a[offset] - bv[threadIdx.y];
} }
__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
/* /*
tensor subtraction (cuda version) tensor subtraction (cuda version)
...@@ -265,72 +145,28 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE ...@@ -265,72 +145,28 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if (stride > 1) { if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) if (beta == (DTYPE)1.0F)
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,false); blockSize, stride, blockSize * stride, blockNum, beta);
else else
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,true); blockSize, stride, blockSize * stride, blockNum, beta);
} }
else if (stride == 1) { else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) if (beta == (DTYPE)1.0F)
KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,false); blockNum, blockSize, beta);
else else
KernelSubWithRow<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,true); blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F){
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, true);
}
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting */
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__ #endif // __SUBDIM_H__
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h" #include "../../XName.h"
#include "../getandset/ConvertDataType.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "Sum.h" #include "Sum.h"
...@@ -38,58 +37,15 @@ tensor summation c = a + b * \beta ...@@ -38,58 +37,15 @@ tensor summation c = a + b * \beta
>> c - where we put a+b*\beta. we save it in a if c is NULL >> c - where we put a+b*\beta. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
XTensor b1(b->order, b->dimSize, a->dataType, b->denseRatio, b->devID, b->mem);
b1.SetTMPFlag();
_ConvertDataType(b, &b1);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSum(a, &b1, c, beta);
}
else
_CudaSum(a, &b1, c, beta);
#endif
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
CheckNTErrors(a && b && c, "Empty tensor input!"); CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum, CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!"); "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
if (beta == 0) { if(beta == 0){
_CopyValues(a, c); _CopyValues(a, c);
return; return;
} }
...@@ -118,7 +74,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -118,7 +74,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else { else {
if (!a->isSparse && !b->isSparse) { if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!"); CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE && if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE) c->dataType == DEFAULT_DTYPE)
...@@ -126,7 +82,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -126,7 +82,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
DTYPE * ap = (DTYPE*)a->data; DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data; DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data; DTYPE * cp = (DTYPE*)c->data;
/* unrolling */ /* unrolling */
int num = a->unitNum; int num = a->unitNum;
if (num % 4 == 0) { if (num % 4 == 0) {
...@@ -243,4 +199,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -243,4 +199,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a+b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
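/* worked numbers for the overload above (illustrative): with a = {1, 2}, b = {10, 20}
   and beta = 0.5, the result is c = {1 + 10 * 0.5, 2 + 20 * 0.5} = {6, 12}; as with Sub,
   the call routes to _SumDim instead of _Sum when b matches exactly one dimension of a. */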
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sum.cuh" #include "Sum.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -46,31 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -46,31 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta; c[i] = a[i] + b[i] * beta;
} }
__global__
void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
if (i < size)
c[i] = a[i] + b[i] * beta1;
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/* /*
tensor summation c = a + b * \beta (cuda version) tensor summation c = a + b * \beta (cuda version)
>> a - a tensor >> a - a tensor
...@@ -126,36 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -126,36 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
} }
else if (a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* /*
...@@ -43,6 +41,9 @@ make a new tensor c to keep the result and return it ...@@ -43,6 +41,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__ #endif // __SUM_H__
...@@ -64,6 +64,20 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet ...@@ -64,6 +64,20 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return; return;
} }
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){ if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta); _CudaSumDim(a, b, c, n, beta);
...@@ -167,6 +181,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -167,6 +181,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor summation
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
/* /*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta c = a + b * \beta
...@@ -307,5 +352,31 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -307,5 +352,31 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
}
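/* a shape sketch for SumBroadcast above (illustrative; a, b and c are made-up names,
   InitTensor2D is assumed from XTensor.h): with a of shape (4, 8) and b of shape (1, 8),
   c[i][j] = a[i][j] + b[0][j] * beta:
       XTensor a, b, c;
       InitTensor2D(&a, 4, 8, X_FLOAT);
       InitTensor2D(&b, 1, 8, X_FLOAT);
       SumBroadcast(a, b, c);               // or SumBroadcast(a, b, c, 1.0F, true) to link the op
*/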
} }
...@@ -42,12 +42,19 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -42,12 +42,19 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */ /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1. /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1.
we return the resulting tensor here */ we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h" #include "../../XTensor.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -81,38 +80,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -81,38 +80,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasHgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc); cublasHgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc);
} }
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
float alpha2 = (float)alpha;
float beta2 = (float)beta;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
else { else {
ShowNTErrors("Unsupported data type!"); ShowNTErrors("Unsupported data type!");
} }
...@@ -160,7 +127,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -160,7 +127,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
unsigned short alpha2 = FloatToFloat16(alpha); unsigned short alpha2 = FloatToFloat16(alpha);
unsigned short beta2 = FloatToFloat16(beta); unsigned short beta2 = FloatToFloat16(beta);
__half * alpha3 = (__half*)&alpha2; __half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2; __half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasHgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count); cublasHgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
...@@ -217,19 +184,13 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -217,19 +184,13 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
__half * alpha3 = (__half*)&alpha2; __half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2; __half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
//float alpha2 = (float)alpha;
//float beta2 = (float)beta;
//cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, strideB, (const int8_t*)a, CUDA_R_8I, ma, strideA, &beta2, (float*)c, CUDA_R_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
ShowNTErrors("TO DO!");
} }
else { else {
ShowNTErrors("Unsupported data type!"); ShowNTErrors("Unsupported data type!");
......
...@@ -35,10 +35,8 @@ convert data type ...@@ -35,10 +35,8 @@ convert data type
*/ */
void _ConvertDataType(const XTensor * input, XTensor * output) void _ConvertDataType(const XTensor * input, XTensor * output)
{ {
if (input->dataType == output->dataType) { if (input->dataType == output->dataType)
_CopyValues(input, output); return;
return ;
}
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <stdint.h>
__global__ __global__
void KernelFloatToFloat16(float * s, __half * t, int size) void KernelFloatToFloat16(float * s, __half * t, int size)
...@@ -61,16 +60,6 @@ void KernelFloatToInt(float * inputData, int * outputData, int size) ...@@ -61,16 +60,6 @@ void KernelFloatToInt(float * inputData, int * outputData, int size)
} }
} }
__global__
void KernelFloatToInt8(float * inputData, int8_t * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (int8_t)(inputData[i]);
}
}
__global__ __global__
void KernelIntToFloat(int * inputData, float * outputData, int size) void KernelIntToFloat(int * inputData, float * outputData, int size)
{ {
...@@ -78,18 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size) ...@@ -78,18 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){ if (i < size){
outputData[i] = (float)(inputData[i]); outputData[i] = (float)(inputData[i]);
} }}
}
__global__
void KernelIntToFloat8(int8_t * inputData, float * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (float)(inputData[i]);
}
}
/* /*
data conversion (cuda code) data conversion (cuda code)
...@@ -145,6 +123,7 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -145,6 +123,7 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
int blockSize[3]; int blockSize[3];
GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize); GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]); dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]); dim3 threads(blockSize[0]);
...@@ -155,14 +134,10 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -155,14 +134,10 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum); KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
else if(input->dataType == X_INT && output->dataType == X_FLOAT) else if(input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum); KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16) else if(input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelFloatToFloat16 << <blocks, threads >> >((float*)input->data, (__half*)output->data, input->unitNum); KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT) else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat << <blocks, threads >> >((__half*)input->data, (float*)output->data, input->unitNum); KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (int8_t*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelIntToFloat8 << <blocks, threads >> >((int8_t*)input->data, (float*)output->data, input->unitNum);
else{ else{
ShowNTErrors("Unsupported data types for conversion!"); ShowNTErrors("Unsupported data types for conversion!");
} }
...@@ -170,91 +145,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -170,91 +145,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
ProtectCudaDev(input->devID, devIDBackup); ProtectCudaDev(input->devID, devIDBackup);
} }
__global__
void KernelConvertf32tof16(float* input, __half *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
output[i] = __float2half(input[i]);
}
}
__global__
void KernelConvertf16tof32(__half* input, float *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
output[i] = __half2float(input[i]);
}
}
/*
convert data type (cuda code)
>> input - input tensor
>> output - output tensor
*/
void _CudaConvertDataTypeNew(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
//int dimRDI = input->order - 1;
int dimRDI = 0;
int stride = 1;
int strideNumA = input->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= input->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < input->order; i++)
blockNum *= input->dimSizeRDI[i];
int gridSize[3];
int blockSize[3];
/*for (int i = 0; i < input->order; ++i)
{
printf("%d ", input->dimSizeRDI[i]);
}
printf("\n");
printf("%d %d\n", dimRDI, input->dimSizeRDI[dimRDI]);
printf("%d %d %d\n", stride, strideNumA, blockNum);*/
int workerNum = 64;
GDevs.GetCudaThread2D(input->devID, workerNum, stride * blockNum, MAX_INT, gridSize, blockSize);
dim3 blocks(gridSize[0], gridSize[1]);
dim3 threads(blockSize[0], blockSize[1]);
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == X_FLOAT && output->dataType == X_INT)
KernelFloatToInt << <blocks, threads >> > ((float*)input->data, (int*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat << <blocks, threads >> > ((int*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelConvertf32tof16 << <blocks, threads >> > ((float*)input->data, (__half*)output->data, stride, strideNumA, blockNum, input->unitNum);
else if (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelConvertf16tof32 << <blocks, threads >> >((__half*)input->data, (float*)output->data, stride, strideNumA, blockNum, input->unitNum);
else {
ShowNTErrors("Unsupported data types for conversion!");
}
ProtectCudaDev(input->devID, devIDBackup);
}
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -47,9 +47,6 @@ void KernelIntToFloat(int * inputData, float * outputData, int size); ...@@ -47,9 +47,6 @@ void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type */ /* convert data type */
void _CudaConvertDataType(const XTensor * input, XTensor * output); void _CudaConvertDataType(const XTensor * input, XTensor * output);
/* convert data type */
void _CudaConvertDataTypeNew(const XTensor * input, XTensor * output);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "OnehotAndIndex.h" #include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh" #include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -31,43 +32,65 @@ convert onehot tensor to index tensor ...@@ -31,43 +32,65 @@ convert onehot tensor to index tensor
>> index - index tensor, which value is an integer num >> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor >> dim - the dimension of the onehot representation
*/ */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size) void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim)
{ {
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!"); dim = (dim < 0 ? onehot->order - 1 : dim);
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!"); CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(dim < onehot->order, "Illegal dimension specified!")
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!") CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!") CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) for (int i = 0; i < index->order; i++) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!"); if (i < dim) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
}
else {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i + 1), "Illegal tensor order!");
}
}
#ifdef USE_CUDA #ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size); _CudaOnehotToIndex(onehot, index, dim);
return; return;
} }
#endif #endif
int blockNum = index->unitNum; int blockNum = 1;
int stride = size; int blockSize = 1;
int dimSize = 1;
int stride = 1;
for (int i = 0; i < dim; i++)
blockNum *= onehot->GetDim(i);
blockSize = onehot->unitNum / blockNum;
dimSize = onehot->GetDim(dim);
for (int i = dim + 1; i < onehot->order; i++)
stride *= onehot->GetDim(i);
int * onehotData = (int *)onehot->data; int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) { for (int j = 0; j < stride; j++) {
if (od[j] != 0) { int * od = onehotData + i * blockSize + j;
if (record == -1) int * index = indexData + i * stride + j;
record = j;
else int record = -1;
ShowNTErrors("The value of onehot tensor is illegal!"); for (int j = 0; j < dimSize; j++) {
if (od[j*stride] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
} }
*index = record;
} }
indexData[i] = record;
} }
} }
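/* a worked example of the index arithmetic above (illustrative): for a onehot tensor of
   shape (2, 5, 3) with dim = 1, blockNum = 2, blockSize = 15, dimSize = 5 and stride = 3;
   element (i, k, j) of the onehot tensor lies at i * blockSize + k * stride + j, which is
   exactly what the od[j * stride] access scans in the innermost loop, and the result for
   block i and inner offset j is written to indexData[i * stride + j]. */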
/* /*
...@@ -99,11 +122,11 @@ convert index tensor to onehot tensor ...@@ -99,11 +122,11 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, which value is 0 or 1 >> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the label smoothing factor
*/ */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
{ {
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!"); CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!"); CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!") //CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!") CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) for (int i = 0; i < index->order; i++)
...@@ -111,9 +134,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -111,9 +134,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
onehot->SetZeroAll(); onehot->SetZeroAll();
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA #ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size); _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return; return;
} }
#endif #endif
...@@ -122,12 +150,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -122,12 +150,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
int stride = size; int stride = size;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data; DTYPE * onehotData = (DTYPE *)onehot->data;
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
int id = indexData[i]; int id = indexData[i];
int * od = onehotData + i * stride; DTYPE * od = onehotData + i * stride;
od[id] = 1; od[id] = confidence;
} }
} }
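/* worked numbers for the label smoothing above (illustrative): with size = 4 and
   labelSmoothingP = 0.1F, confidence = 1 - 0.1 = 0.9 and lowconfidence = 0.1 / 4 = 0.025;
   the CUDA kernel writes 0.9 at the gold index and 0.025 everywhere else, while this CPU
   branch writes only the 0.9 entry on top of the zero-initialized tensor. */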
...@@ -138,9 +166,10 @@ make a new tensor to keep the result and return it ...@@ -138,9 +166,10 @@ make a new tensor to keep the result and return it
>> index - index tensor, which value is an integer num >> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the label smoothing factor
<< return - the onehot tensor << return - the onehot tensor
*/ */
XTensor IndexToOnehot(XTensor & index, int size) XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
{ {
CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!") CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
...@@ -151,9 +180,9 @@ XTensor IndexToOnehot(XTensor & index, int size) ...@@ -151,9 +180,9 @@ XTensor IndexToOnehot(XTensor & index, int size)
int * dim = new int[order + 1]; int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int)); memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size; dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem); InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size); _IndexToOnehot(&index, &onehot, size, labelSmoothingP);
delete[] dim; delete[] dim;
......
...@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version) ...@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version)
>> stride - stride of a data block >> stride - stride of a data block
*/ */
__global__ __global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride) void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int stride, float confidence, float lowconfidence)
{ {
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -107,10 +107,16 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st ...@@ -107,10 +107,16 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st
if (i >= blockNum || offset >= stride) if (i >= blockNum || offset >= stride)
return; return;
int * od = onehotData + i * stride; DTYPE * od = onehotData + i * stride;
int id = indexData[i]; int id = indexData[i];
od[id] = 1; //od[id] = confidence;
if (offset == id)
od[offset] = confidence;
else{
od[offset] = lowconfidence;
}
} }
/* /*
...@@ -120,7 +126,7 @@ convert index tensor to onehot tensor (cuda version) ...@@ -120,7 +126,7 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1 >> onehot - onehot tensor whose values are confidence (at the gold index) or lowconfidence (elsewhere)
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
*/ */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size) void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
{ {
int devID = onehot->devID; int devID = onehot->devID;
...@@ -138,10 +144,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -138,10 +144,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data; DTYPE * onehotData = (DTYPE *)onehot->data;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride); KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride, confidence, lowconfidence);
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
......
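The kernel writes one element per thread; its net effect on each row is restated below as plain CPU code for clarity (a sketch, not code from the commit):

// per-row effect of KernelIndexToOnehot: the gold position gets "confidence",
// every other position gets "lowconfidence"
void FillSmoothedRow(DTYPE * row, int gold, int size,
                     float confidence, float lowconfidence)
{
    for (int j = 0; j < size; j++)
        row[j] = (j == gold) ? confidence : lowconfidence;
}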
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size); void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */ /* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size); void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,18 +27,18 @@ ...@@ -27,18 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */ /* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size); void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
/* convert onehot tensor to index tensor (return an XTensor structure) /* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num); XTensor OnehotToIndex(XTensor & onehot, int size);
/* convert index tensor to onehot tensor */ /* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size); void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP = 0.0F);
/* convert index tensor to onehot tensor (return an XTensor structure) /* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num); XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP = 0.0F);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
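The header gives labelSmoothingP a default of 0.0F, so existing call sites keep compiling. A minimal usage sketch of the new interface (the tensor sizes and the InitTensor1D call are illustrative assumptions):

// build a smoothed one-hot matrix from integer labels (sketch)
XTensor label;
InitTensor1D(&label, 8, X_INT);                    // 8 positions, each holding a label id
SetDataFixedInt(label, 3);                         // e.g. every position is class 3
XTensor onehot = IndexToOnehot(label, 32, 0.1F);   // last-dimension size 32, smoothing 0.1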
...@@ -168,6 +168,17 @@ void SetDataFixed(XTensor &tensor, DTYPE p) ...@@ -168,6 +168,17 @@ void SetDataFixed(XTensor &tensor, DTYPE p)
{ {
_SetDataFixed(&tensor, &p); _SetDataFixed(&tensor, &p);
} }
/*
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an integer
*/
void SetDataFixedInt(XTensor &tensor, int p)
{
CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
_SetDataFixed(&tensor, &p);
}
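The new overload simply forwards to _SetDataFixed after checking the data type, so callers can write, for example (the tensor construction here is an illustrative assumption):

XTensor mask;
InitTensor1D(&mask, 16, X_INT);   // an integer tensor
SetDataFixedInt(mask, 1);         // every item becomes 1; a non-X_INT tensor trips the CheckNTErrors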
/* /*
generate data items with a fixed value p (in integer) generate data items with a fixed value p (in integer)
...@@ -387,7 +398,7 @@ generate data items with a uniform distribution in [lower, upper] ...@@ -387,7 +398,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range >> lower - lower value of the range
>> upper - upper value of the range >> upper - upper value of the range
*/ */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{ {
CheckNTErrors(upper > lower, "the high value must be greater than low value!"); CheckNTErrors(upper > lower, "the high value must be greater than low value!");
...@@ -440,9 +451,9 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise ...@@ -440,9 +451,9 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold >> p - the threshold
>> value - the value we intend to assign to the item >> value - the value we intend to assign to the item
*/ */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value) void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{ {
//CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO"); CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if (tensor->devID < 0) { if (tensor->devID < 0) {
_SetDataRand(tensor, lower, upper); _SetDataRand(tensor, lower, upper);
......
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
#include <curand_kernel.h> #include <curand_kernel.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -188,30 +186,6 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance) ...@@ -188,30 +186,6 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
} }
/* /*
set data array with a uniform distribution in [low, high]
>> deviceStates - the state of curand
>> d - float datatype pointer to the data array
>> size - size of the array
>> lower - low value of the range
>> variance - the variance of the range
*/
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
}
#endif
}
/*
set data items to a pre-defined value if its value >= p, set it to 0 otherwise set data items to a pre-defined value if its value >= p, set it to 0 otherwise
>> d - pointer to the data array >> d - pointer to the data array
>> size - size of the array >> size - size of the array
...@@ -231,24 +205,6 @@ void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value) ...@@ -231,24 +205,6 @@ void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
} }
} }
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
/* /*
set data items along with a given dimension (and keep the remaining items unchanged) - kernel version set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
...@@ -501,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper] ...@@ -501,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range >> lower - lower value of the range
>> upper - upper value of the range >> upper - upper value of the range
*/ */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{ {
CheckNTErrors(upper > lower, "the high value must be greater than low value!"); CheckNTErrors(upper > lower, "the high value must be greater than low value!");
...@@ -516,44 +472,21 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -516,44 +472,21 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem); curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
}
else{
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower; DTYPE variance = upper - lower;
if (variance != 1.0F || lower != 0) { if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT) { if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance); KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
} else if (tensor->dataType == X_DOUBLE)
else if (tensor->dataType == X_DOUBLE) { KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
} }
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
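The cleaned-up version drops the FLOAT16 path and keeps only the float/double branches: curandGenerateUniform first fills the buffer with values in (0, 1], and the type-specific kernels then apply the same affine map. Restated as a sketch:

// what KernelSetDataRandFloat / KernelSetDataRandDouble compute per element:
// u drawn from (0, 1]  ->  lower + u * (upper - lower)
DTYPE MapUniform(DTYPE u, DTYPE lower, DTYPE upper)
{
    return u * (upper - lower) + lower;   // "variance" = upper - lower in the code above
}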
/* /*
generate data items with a uniform distribution in [lower, upper] and set generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise the item to a pre-defined value if the item >= p, set the item to 0 otherwise
...@@ -563,7 +496,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise ...@@ -563,7 +496,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold >> p - the threshold
>> value - the value we intend to assign to the item >> value - the value we intend to assign to the item
*/ */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value) void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{ {
_CudaSetDataRand(tensor, lower, upper); _CudaSetDataRand(tensor, lower, upper);
...@@ -577,16 +510,8 @@ void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYP ...@@ -577,16 +510,8 @@ void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYP
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) { KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
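After the uniform fill, the p-cut kernel thresholds each element; the simplified launcher now assumes DEFAULT_DTYPE data, matching the check re-enabled in _SetDataRandP above. The per-element rule, restated on the CPU as a sketch:

// per-element effect of KernelSetDataPCut
void PCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
    for (int i = 0; i < size; i++)
        d[i] = (d[i] >= p) ? value : 0.0F;
}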
......