Commit 3a515f68 by liyinqiao

Merge with XU Chen branch (Don't use this! It's an incomplete version)

1. Support efficient propagation and gradient accumulation for the backward functions (a sketch of the pattern follows below).
2. Update the SetData functions.
3. Clean up the code.
parent be870567
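The pattern behind item 1 shows up throughout the diff below. As an illustrative sketch only (GradExample is a hypothetical name, not a function in this commit; the calls it uses, XNoder::MakeGrad, NewTensor, _SumMe, and DelTensor, all appear in the changed files), a backward function now roughly has this shape:

/* sketch only: how a backward function handles the efficient mode and
   gradient accumulation after this merge */
void GradExample(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    XTensor * a = income.tails[0];

    /* in efficient mode, skip inputs that do not need a gradient */
    if (!isEfficient || a->isGrad) {
        /* allocate a->grad if it does not exist yet */
        XNoder::MakeGrad(a);

        /* write the local derivative dE/da into a temporary tensor
           shaped like the input */
        XTensor * tmp = NewTensor(a);
        /* ... operator-specific code fills tmp here ... */

        /* accumulate into the existing gradient instead of overwriting it,
           so a node that feeds several children collects all contributions */
        _SumMe(a->grad, tmp);
        DelTensor(tmp);
    }

    node->visitMark = NODE_FINISHED;
}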
@@ -31,43 +31,54 @@ namespace nts{

/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
-   XLink &income = node->income;
-   int operID = income.typeID;
-
-   if(!isEfficient){
+   if (!isEfficient) {
        CheckNTErrors(node->grad != NULL, "No gradient found!");
    }
-   else{
+   else {
        CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
    }

+   XLink &income = node->income;
+   int operID = income.typeID;
+
    CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");

    XTensor * input = income.tails[0];
    XTensor * output = node;

-   XNoder::MakeGrad(input);
-
-   if(operID == FUNC_HARDTANH)
-       _HardTanHBackward(output, input, output->grad, input->grad);
-   else if(operID == FUNC_IDENTITY)
-       _IdentityBackward(output, input, output->grad, input->grad);
-   else if(operID == FUNC_LOGSOFTMAX){
-       int leadDim = income.GetParamInt(0);
-       CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-       _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-   }
-   else if(operID == FUNC_RECTIFY)
-       _RectifyBackward(output, input, output->grad, input->grad);
-   else if(operID == FUNC_SIGMOID)
-       _SigmoidBackward(output, input, output->grad, input->grad);
-   else if(operID == FUNC_SOFTMAX){
-       int leadDim = income.GetParamInt(0);
-       CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-       _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-   }
-   else{
-       ShowNTErrors("Wrong activation function type!");
-   }
+   if (!isEfficient || input->isGrad) {
+       XNoder::MakeGrad(input);
+
+       XTensor * dedx = input->grad;
+       XTensor * dedy = output->grad;
+       //XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
+       XTensor * tmp = NewTensor(output);
+
+       if (operID == FUNC_HARDTANH)
+           _HardTanHBackward(output, input, dedy, tmp);
+       else if (operID == FUNC_IDENTITY)
+           _IdentityBackward(output, input, dedy, tmp);
+       else if (operID == FUNC_LOGSOFTMAX) {
+           int leadDim = income.GetParamInt(0);
+           CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
+           _LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+       }
+       else if (operID == FUNC_RECTIFY)
+           _RectifyBackward(output, input, dedy, tmp);
+       else if (operID == FUNC_SIGMOID)
+           _SigmoidBackward(output, input, dedy, tmp);
+       else if (operID == FUNC_SOFTMAX) {
+           int leadDim = income.GetParamInt(0);
+           CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
+           _SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+       }
+       else {
+           ShowNTErrors("Wrong activation function type!");
+       }
+
+       _SumMe(dedx, tmp);
+       //DelTensorBuf(tmp);
+       DelTensor(tmp);
+   }

    node->visitMark = NODE_FINISHED;
...
@@ -33,7 +33,6 @@
namespace nts{

/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
@@ -48,33 +47,33 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * padding = NULL;
    int leadingDim;

-   XNoder::MakeGrad(output);
-   XTensor * dedy = output->grad;
-
-   if (income.tailNum == 1) {
-       if(dedy->dataType == X_FLOAT)
-           _SetDataFixedFloat(dedy, 1.0F);
-       else if(dedy->dataType == X_DOUBLE)
-           _SetDataFixedDouble(dedy, 1.0);
-       else if(dedy->dataType == X_INT)
-           _SetDataFixedInt(dedy, 1);
-       else
-           ShowNTErrors("TODO");
-
-       return;
-   }
-
-   gold = income.tails[1];
-
-   if(operID == LOSS_CROSSENTROPY) {
-       if (income.tailNum == 3)
-           padding = income.tails[2];
-       leadingDim = income.GetParamInt(0);
-       CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
-       _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
-   }
-   else{
-       ShowNTErrors("Wrong activation function type!");
-   }
+   if (!isEfficient || output->isGrad) {
+       XNoder::MakeGrad(output);
+       XTensor * dedy = output->grad;
+
+       if (income.tailNum == 1) {
+           dedy->SetDataFixed(1);
+           return;
+       }
+
+       gold = income.tails[1];
+
+       //XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
+       XTensor* tmp = NewTensor(output);
+
+       if (operID == LOSS_CROSSENTROPY) {
+           if (income.tailNum == 3)
+               padding = income.tails[2];
+           leadingDim = income.GetParamInt(0);
+           CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
+           _CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
+           _SumMe(dedy, tmp);
+       }
+       else {
+           ShowNTErrors("Wrong activation function type!");
+       }
+
+       //DelTensorBuf(tmp);
+       DelTensor(tmp);
+   }

    node->visitMark = NODE_FINISHED;
@@ -87,79 +86,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
    return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
@@ -30,82 +30,82 @@ namespace nts{

/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
{
    if (!isEfficient) {
        CheckNTErrors(node->grad != NULL, "No gradient found!");
    }
    else {
        CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
    }

    XLink &income = node->income;
    int operID = income.typeID;

    if (operID == MATH_ABSOLUTE)
        GradAbsolute(node, isEfficient);
    else if (operID == MATH_COS)
        GradCos(node, isEfficient);
    else if (operID == MATH_EXP)
        GradExp(node, isEfficient);
    else if (operID == MATH_LOG)
        GradLog(node, isEfficient);
    else if (operID == MATH_ROUND)
        GradRound(node, isEfficient);
    else if (operID == MATH_SIGN)
        GradSign(node, isEfficient);
    else if (operID == MATH_SIN)
        GradSin(node, isEfficient);
    else if (operID == MATH_TAN)
        GradTan(node, isEfficient);
    else if (operID == MATH_CLIP)
        GradClip(node, isEfficient);
    else if (operID == MATH_DIV)
        GradDiv(node, isEfficient);
    else if (operID == MATH_DIVDIM)
        GradDivDim(node, isEfficient);
    else if (operID == MATH_MATRIXMUL)
        GradMatrixMul(node, isEfficient);
    else if (operID == MATH_MATRIXMULBATCHED)
        GradMatrixMulBatched(node, isEfficient);
    else if (operID == MATH_MULTIPLY)
        GradMultiply(node, isEfficient);
    else if (operID == MATH_MULTIPLYDIM)
        GradMultiplyDim(node, isEfficient);
    else if (operID == MATH_MULTIPLYBROADCAST)
        GradMultiplyBroadcast(node, isEfficient);
    else if (operID == MATH_NEGATE)
        GradNegate(node, isEfficient);
    else if (operID == MATH_NORMALIZE)
        GradNormalize(node, isEfficient);
    else if (operID == MATH_POWER)
        GradPower(node, isEfficient);
    else if (operID == MATH_SCALEANDSHIFT)
        GradScaleAndShift(node, isEfficient);
    else if (operID == MATH_SCALE)
        GradScale(node, isEfficient);
    else if (operID == MATH_DESCALE)
        GradDescale(node, isEfficient);
    else if (operID == MATH_SHIFT)
        GradShift(node, isEfficient);
    else if (operID == MATH_SUB)
        GradSub(node, isEfficient);
    else if (operID == MATH_SUBDIM)
        GradSubDim(node, isEfficient);
    else if (operID == MATH_SUM)
        GradSum(node, isEfficient);
    else if (operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
    else if (operID == MATH_SUMBROADCAST)
        GradSumBroadcast(node, isEfficient);
    else if (operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
    else if (operID == REDUCE_REDUCESUM)
        GradReduceSum(node, isEfficient);
    else if (operID == REDUCE_REDUCESUMALL)
        GradReduceSumAll(node, isEfficient);
    else if (operID == REDUCE_REDUCESUMSQUARED)
        GradReduceSumSquared(node, isEfficient);
    else if (operID == REDUCE_REDUCEVARIANCE)
        GradReduceVariance(node, isEfficient);
    else if (operID == MATH_MULANDSHIFT)
        GradMulAndShift(node, isEfficient);
@@ -138,14 +138,17 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a); /* dE/da = dE/dc * sign(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sign(a, b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Multiply(node->grad, b, a->grad, 1.0F); _Sign(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -166,15 +169,18 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a); /* dE/da = dE/dc * -sin(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sin(a, b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleAndShiftMe(b, -1.0F); _Sin(a, tmp);
_Multiply(node->grad, b, a->grad, 1.0F); _NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -195,14 +201,17 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a); /* dE/da = dE/dc * exp(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Exp(a, b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Multiply(node->grad, b, a->grad, 1.0F); _Exp(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -224,9 +233,11 @@ void XMathGrad::GradLog(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XNoder::MakeGrad(a); /* dE/da = dE/dc * 1/a */
if (!isEfficient || a->isGrad) {
_Div(node->grad, a, a->grad, 1.0F); XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -246,8 +257,12 @@ void XMathGrad::GradRound(XTensor * node, bool isEfficient)
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
// we do nothing here XTensor * a = income.tails[0];
// TODO: set grad = 0 if the node is the only child
/* dE/da = 0, we do nothing here */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -267,8 +282,12 @@ void XMathGrad::GradSign(XTensor * node, bool isEfficient)
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
// we do nothing here XTensor * a = income.tails[0];
// TODO: set grad = 0 if the node is the only child
/* dE/da = 0, we do nothing here */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -289,14 +308,17 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a); /* dE/da = dE/dc * cos(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Cos(a, b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Multiply(node->grad, b, a->grad, 1.0F); _Cos(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -317,15 +339,18 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b); /* dE/da = dE/dc * 1/(cos(a))^2
_PowerMe(b, -2.0F); = dE/dc * (cos(a))^-2 */
_Multiply(node->grad, b, a->grad, 1.0F); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Cos(a, tmp);
_PowerMe(tmp, -2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -345,17 +370,21 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0); DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1); DTYPE upper = income.GetParam(1);
XNoder::MakeGrad(a); /* dE/da = 1 lower < a < upper
= 0 otherwise */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_ClipBackward(node, a, node->grad, a->grad, lower, upper); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sum(a->grad, b, a->grad); _ClipBackward(node, a, node->grad, tmp, lower, upper);
_SumMe(a->grad, tmp);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -378,21 +407,26 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
XTensor * ab2 = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!"); CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
/* dE/da = dE/dc / b */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Div(node->grad, b, a->grad, 1.0F);
}
_Div(node->grad, b, a->grad, 1.0F); /* dE/db = dE/dc * a/(-b^2)
= dE/dc * a * (-b^-2) */
_Power(b, ab2, -2.0F); if (!isEfficient || b->isGrad) {
_Multiply(a, ab2, ab2); XNoder::MakeGrad(b);
_ScaleAndShiftMe(ab2, -1.0F); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Multiply(node->grad, ab2, b->grad, 1.0F); _Power(b, tmp, -2.0F);
_NegateMe(tmp);
_MultiplyMe(tmp, a);
_Multiply(node->grad, tmp, b->grad, 1.0F);
DelTensorBuf(ab2); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -416,87 +450,82 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
int n = income.GetParamInt(0); int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da = dE/dc * (1/b) */ /* dE/da = dE/dc * (1/b) */
_DivDim(node->grad, b, a->grad, n, 1.0); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_DivDim(node->grad, b, a->grad, n, 1.0);
}
/* dE/db = dE/dc * dc/db */ /* dE/db = dE/dc * dc/db
int order = a->order; = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...) */
int dimSize[MAX_TENSOR_DIM_NUM]; if (!isEfficient || b->isGrad) {
memcpy(dimSize, a->dimSize, sizeof(int) * a->order); XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem); XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem); XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem); XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem); XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Negate(a, aTMP1); _Negate(a, aTMP1);
_Power(b, bTMP, -2.0F); _Power(b, bTMP, -2.0F);
_MultiplyDim(aTMP1, bTMP, aTMP2, n); _MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP); _Multiply(node->grad, aTMP2, interGradTMP);
if(n == order - 1){ if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM]; int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1]; reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1]; reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the /* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */ size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize); interGradTMP->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0); _ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
_SumMe(b->grad, bGradTMP);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
} }
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]); for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem); /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
_ReduceSum(interGradTMP, interGrad, 2); Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
//if(b->outgo.tailNum > 1){ XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem); _ReduceSum(interGradTMP, interGrad, 2);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0); _ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
_SumMe(b->grad, bGradTMP2);
DelTensorBuf(bGradTMP2); DelTensorBuf(bGradTMP2);
/*} DelTensorBuf(interGrad);
else{ }
_ReduceSum(interGrad, b->grad, 0);
}*/
DelTensorBuf(interGrad);
}
DelTensorBuf(interGradTMP); DelTensorBuf(interGradTMP);
DelTensorBuf(bTMP); DelTensorBuf(bTMP);
DelTensorBuf(aTMP2); DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1); DelTensorBuf(aTMP1);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -523,9 +552,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1); MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2); DTYPE alpha = income.GetParam(2);
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
XNoder::MakeGrad(a); XNoder::MakeGrad(a);
if(!isEfficient || b->isGrad) if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b); XNoder::MakeGrad(b);
XTensor * c = node; XTensor * c = node;
@@ -533,9 +562,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * deda = a->grad; XTensor * deda = a->grad;
XTensor * dedb = b->grad; XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2) if (a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient); GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){ else if (transA == X_NOTRANS && a->order > 2 && b->order == 2){
int orderBackupA = a->order; int orderBackupA = a->order;
int orderBackupC = c->order; int orderBackupC = c->order;
int dimsBackupA[MAX_TENSOR_DIM_NUM]; int dimsBackupA[MAX_TENSOR_DIM_NUM];
@@ -545,7 +574,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1)); a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1)); c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1)); deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1)); dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
@@ -553,7 +582,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
a->Reshape(orderBackupA, dimsBackupA); a->Reshape(orderBackupA, dimsBackupA);
c->Reshape(orderBackupC, dimsBackupC); c->Reshape(orderBackupC, dimsBackupC);
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
deda->Reshape(orderBackupA, dimsBackupA); deda->Reshape(orderBackupA, dimsBackupA);
dedc->Reshape(orderBackupC, dimsBackupC); dedc->Reshape(orderBackupC, dimsBackupC);
} }
@@ -580,54 +609,54 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
                              XTensor * dedc, DTYPE alpha, bool isEfficient)
{ {
/* c = a * b * \alpha */ /* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){ if (transA == X_NOTRANS && transB == X_NOTRANS) {
/* dE/da = dE/dc * b^T * \alpha */ /* dE/da = dE/dc * b^T * \alpha */
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F); _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */ /* dE/db = a^T * dE/dc * \alpha */
if(!isEfficient || b->isGrad) if (!isEfficient || b->isGrad)
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F); _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a^T * b * \alpha */ /* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){ else if (transA == X_TRANS && transB == X_NOTRANS){
/* dE/da = (dE/dc * b^T)^T * \alpha /* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */ = b * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F); _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */ /* dE/db = a * dE/dc * \alpha */
if(!isEfficient || b->isGrad) if (!isEfficient || b->isGrad)
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F); _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a * b^T * \alpha */ /* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){ else if (transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */ /* dE/da = dE/dc * b * \alpha */
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F); _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha /* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */ = dE/dc^T * a * \alpha */
if(!isEfficient || b->isGrad) if (!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F); _MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a^T * b^T * \alpha */ /* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){ else if (transA == X_TRANS && transB == X_TRANS){
/* dE/da = (dE/dc * b)^T * \alpha /* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */ = b^T * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad) if (!isEfficient || a->isGrad)
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F); _MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha /* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */ = dE/dc^T * a^T * \alpha */
if(!isEfficient || b->isGrad) if (!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F); _MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
} }
} }
@@ -655,55 +684,65 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1); MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2); DTYPE alpha = income.GetParam(2);
XNoder::MakeGrad(a); if (!isEfficient || a->isGrad)
XNoder::MakeGrad(b); XNoder::MakeGrad(a);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
XTensor * dedc = node->grad; XTensor * dedc = node->grad;
XTensor * deda = a->grad; XTensor * deda = a->grad;
XTensor * dedb = b->grad; XTensor * dedb = b->grad;
/* c = a * b * \alpha */ /* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){ if (transA == X_NOTRANS && transB == X_NOTRANS) {
/* dE/da = dE/dc * b^T * \alpha */ /* dE/da = dE/dc * b^T * \alpha */
_MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F); if (!isEfficient || a->isGrad)
_MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */ /* dE/db = a^T * dE/dc * \alpha */
_MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F); if (!isEfficient || b->isGrad)
_MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a^T * b * \alpha */ /* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){ else if (transA == X_TRANS && transB == X_NOTRANS) {
/* dE/da = (dE/dc * b^T)^T * \alpha /* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */ = b * dE/dc^T * \alpha */
_MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F); if (!isEfficient || a->isGrad)
_MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */ /* dE/db = a * dE/dc * \alpha */
_MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F); if (!isEfficient || b->isGrad)
_MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a * b^T * \alpha */ /* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){ else if (transA == X_NOTRANS && transB == X_TRANS) {
/* dE/da = dE/dc * b * \alpha */ /* dE/da = dE/dc * b * \alpha */
_MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F); if (!isEfficient || a->isGrad)
_MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha /* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */ = dE/dc^T * a * \alpha */
_MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F); if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
} }
/* c = a^T * b^T * \alpha */ /* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){ else if (transA == X_TRANS && transB == X_TRANS) {
/* dE/da = (dE/dc * b)^T * \alpha /* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */ = b^T * dE/dc^T * \alpha */
_MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F); if (!isEfficient || a->isGrad)
_MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha /* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */ = dE/dc^T * a^T * \alpha */
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F); if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
} }
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
@@ -730,11 +769,13 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!"); CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
/* dE/da = dE/dc * b */
if (!isEfficient || a->isGrad) { if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a); XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F); _Multiply(node->grad, b, a->grad, 1.0F);
} }
/* dE/db = dE/dc * a */
if (!isEfficient || b->isGrad) { if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b); XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F); _Multiply(node->grad, a, b->grad, 1.0F);
@@ -762,77 +803,70 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
int n = income.GetParamInt(0); int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da */ /* dE/da = dE/dc * b */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
/* dE/db */ _MultiplyDim(node->grad, b, a->grad, n, 1.0F);
int order = a->order; }
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem); /* dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...) */
_Multiply(node->grad, a, bGradTMP); if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
if(n == order - 1){ int order = a->order;
int reshapedSize[MAX_TENSOR_DIM_NUM]; int dimSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1]; memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
size of b. Then we can reduce the matrix into a row vector. */ _Multiply(node->grad, a, bGradTMP);
bGradTMP->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){ if (n == order - 1) {
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem); int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0); _ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad); _Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2); DelTensorBuf(bGradTMP2);
/*}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
} }
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]); for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem); /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
_ReduceSum(bGradTMP, interGrad, 2); Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
//if(b->outgo.tailNum > 1){ XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem); _ReduceSum(bGradTMP, interGrad, 2);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0); _ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad); _Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2); DelTensorBuf(bGradTMP2);
/*} DelTensorBuf(interGrad);
else{ }
_ReduceSum(interGrad, b->grad, 0); DelTensorBuf(bGradTMP);
}*/
DelTensorBuf(interGrad);
} }
DelTensorBuf(bGradTMP);
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -859,11 +893,18 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
XNoder::MakeGrad(a); XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
if(b->isVar || b->income.tailNum > 0){ /* dE/da = dE/dc * b */
ShowNTErrors("TODO"); if (!isEfficient || a->isGrad)
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
/* dE/db = (dE/dc * a).reduce(0...n) */
if (!isEfficient || b->isGrad) {
if (b->isVar || b->income.tailNum > 0)
ShowNTErrors("TODO");
} }
node->visitMark = NODE_FINISHED;
} }
/* /*
@@ -882,14 +923,12 @@ gradient for negate
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F); /* dE/da = dE/dc * (-1) */
_Sum(a->grad, b, a->grad); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
DelTensorBuf(b); _Sum(a->grad, node->grad, a->grad, -1.0F);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -903,7 +942,6 @@ gradient for normalize
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient) void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{ {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
/* /*
@@ -922,17 +960,20 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
DTYPE p = income.GetParam(0); DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = (dE/dc) * p * a^(p-1) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Power(a, b, p - 1.0F); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleAndShiftMe(b, p); _Power(a, tmp, p - 1.0F);
_Multiply(node->grad, b, a->grad, 1.0F); _ScaleAndShiftMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -956,9 +997,12 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
DTYPE scale = income.GetParam(0); DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = dE/dc * scale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale); _Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -982,9 +1026,12 @@ void XMathGrad::GradScale(XTensor * node, bool isEfficient)
DTYPE scale = income.GetParam(0); DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = dE/dc * scale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale); _Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1008,9 +1055,12 @@ void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
DTYPE descale = income.GetParam(0); DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = dE/dc / descale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1/descale); _Sum(a->grad, node->grad, a->grad, 1 / descale);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1032,9 +1082,12 @@ void XMathGrad::GradShift(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XNoder::MakeGrad(a); /* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad); _Sum(a->grad, node->grad, a->grad);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1059,11 +1112,17 @@ void XMathGrad::GradSub(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0); DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = dE/dc */
XNoder::MakeGrad(b); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
_Sum(a->grad, node->grad, a->grad); /* dE/db = -dE/dc * \beta */
_Sum(b->grad, node->grad, b->grad, -beta); if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, -beta);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1087,81 +1146,70 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
int n = income.GetParamInt(0); int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1); DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad); /* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
int order = a->order; /* dE/db = - dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta */
int dimSize[MAX_TENSOR_DIM_NUM]; if (!isEfficient || b->isGrad) {
memcpy(dimSize, a->dimSize, sizeof(int) * a->order); XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){ if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM]; int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1]; reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1]; reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the /* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */ size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize); node->grad->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0); _ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F) if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta); _ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad); _Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
/*}
else{ node->grad->Reshape(order, dimSize);
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
} }
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]); for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem); /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
_ReduceSum(node->grad, interGrad, 2); XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0); _ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F) if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta); _ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad); _Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad); node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
} }
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
@@ -1174,7 +1222,6 @@ c = a + b * \beta
we have we have
dE/da = dE/dc dE/da = dE/dc
dE/db = dE/dc * \beta dE/db = dE/dc * \beta
>> node - the node (c) for backward computation >> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
@@ -1188,12 +1235,14 @@ void XMathGrad::GradSum(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0); DTYPE beta = income.GetParam(0);
if(!isEfficient || a->isGrad){ /* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a); XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad); _Sum(a->grad, node->grad, a->grad);
} }
if(!isEfficient || b->isGrad){ /* dE/db = dE/dc * \beta */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b); XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta); _Sum(b->grad, node->grad, b->grad, beta);
} }
@@ -1221,81 +1270,72 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
int n = income.GetParamInt(0); int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1); DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad); if (!isEfficient || a->isGrad) {
/* dE/da = dE/dc */
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
int order = a->order; /* dE/db = dE/dc * a.reduce(0,...,n-1,n+1,...) * \beta */
int dimSize[MAX_TENSOR_DIM_NUM]; if (!isEfficient || b->isGrad) {
memcpy(dimSize, a->dimSize, sizeof(int) * a->order); XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){ if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM]; int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1]; reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1]; reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the /* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */ size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize); node->grad->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0); _ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F) if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta); _ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad); _Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
/*}
else{ node->grad->Reshape(order, dimSize);
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}*/
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
} }
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]); reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */ Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize); node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem); XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2); _ReduceSum(node->grad, interGrad, 2);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0); _ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F) if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta); _ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad); _Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}*/
node->grad->Reshape(order, dimSize); node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
DelTensorBuf(interGrad);
}
} }
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1322,12 +1362,20 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1]; XTensor * b = income.tails[1];
//DTYPE beta = income.GetParam(0); //DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a); /* dE/da = dE/dc */
_Sum(a->grad, node->grad, a->grad); if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
if(b->isVar || b->income.tailNum > 0){ /* dE/db = dE/dc * a.reduce(0..n) * \beta */
ShowNTErrors("TODO"); if (!isEfficient || b->isGrad) {
if (b->isVar || b->income.tailNum > 0) {
ShowNTErrors("TODO");
}
} }
node->visitMark = NODE_FINISHED;
} }
/* /*
@@ -1347,18 +1395,21 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int n = a->GetDim(dim); int n = a->GetDim(dim);
XNoder::MakeGrad(a); /* dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim] */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleAndShiftMe(b, 1.0F/n); _Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, b, a->grad); _ScaleAndShiftMe(tmp, 1.0F / n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(b); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
@@ -1368,7 +1419,7 @@ gradient for reduceSum
for for
c = reduceSum(a, dim) c = reduceSum(a, dim)
we have we have
dE/da = Unsqueeze(dE/dc) * 1 dE/da = Unsqueeze(dE/dc)
>> node - the node (c) for backward computation >> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
@@ -1380,17 +1431,19 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int n = a->GetDim(dim); int n = a->GetDim(dim);
XNoder::MakeGrad(a); /* dE/da = Unsqueeze(dE/dc) */
if (!isEfficient || a->isGrad) {
_Unsqueeze(node->grad, b, dim, n); XNoder::MakeGrad(a);
_Sum(a->grad, b, a->grad);
DelTensorBuf(b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
...@@ -1412,16 +1465,17 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient) ...@@ -1412,16 +1465,17 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!"); CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
DTYPE value = node->grad->Get0D();
_SetDataFixed(b, (void*)&value);
_Sum(a->grad, b, a->grad); /* dE/da = dE/dc * 1 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
DelTensorBuf(b); XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
DTYPE value = node->grad->Get0D();
tmp->SetDataFixed(value);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
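Because reduceSumAll collapses a into a single scalar c, every entry of a contributes with weight 1 and receives the same gradient: for example, if a is a 2x2 tensor and dE/dc = 0.5, then dE/da is a 2x2 tensor filled with 0.5. That is why the code above reads the scalar with Get0D(), fills a temporary tensor shaped like a with it, and accumulates the result into a->grad.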
...@@ -1452,22 +1506,28 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient) ...@@ -1452,22 +1506,28 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int n = a->GetDim(dim); int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* compute a-b */
_Unsqueeze(b, c, dim, n); _Unsqueeze(b, c, dim, n);
_Sub(a, c, d); _Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) */ /* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) */
_ScaleAndShiftMe(d, 2.0F); if (!isEfficient || a->isGrad) {
_Unsqueeze(node->grad, e, dim, n); XNoder::MakeGrad(a);
_Multiply(d, e, a->grad, 1.0F);
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
}
/* dE/db = dE/dc * -2 * \sum_i (a_i - b) */ /* dE/db = dE/dc * -2 * \sum_i (a_i - b) */
_ScaleAndShiftMe(f, -2.0F); if (!isEfficient || b->isGrad) {
_Multiply(node->grad, f, b->grad, 1.0F); XNoder::MakeGrad(b);
_ReduceSum(d, f, dim);
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
}
DelTensorBuf(f); DelTensorBuf(f);
DelTensorBuf(e); DelTensorBuf(e);
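For reduceSumSquared, with d = a - b and f = ReduceSum(d, dim) as in the code above, a short derivation (standard calculus, not part of the diff):

$$c_j = \sum_i (a_{i,j}-b_j)^2 \;\Longrightarrow\; \frac{\partial E}{\partial a_{i,j}} = 2\,(a_{i,j}-b_j)\,\frac{\partial E}{\partial c_j},\qquad \frac{\partial E}{\partial b_j} = -2\,\frac{\partial E}{\partial c_j}\sum_i (a_{i,j}-b_j),$$

which matches the _ScaleAndShiftMe(d, 2.0F) in the a-branch and the _ScaleAndShiftMe(f, -2.0F) in the b-branch.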
...@@ -1504,22 +1564,27 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient) ...@@ -1504,22 +1564,27 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int n = a->GetDim(dim); int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* compute a-b */
_Unsqueeze(b, c, dim, n); _Unsqueeze(b, c, dim, n);
_Sub(a, c, d); _Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) / n */ /* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) / n */
_ScaleAndShiftMe(d, 2.0F / n); if (!isEfficient || a->isGrad) {
_Unsqueeze(node->grad, e, dim, n); XNoder::MakeGrad(a);
_Multiply(d, e, a->grad, 1.0F);
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
}
/* dE/db = dE/dc * -2/n * \sum_i (a_i - b) */ /* dE/db = dE/dc * -2/n * \sum_i (a_i - b) */
_ScaleAndShiftMe(f, -2.0F /n); if (!isEfficient || b->isGrad) {
_Multiply(node->grad, f, b->grad, 1.0F); XNoder::MakeGrad(b);
_ReduceSum(d, f, dim);
_ScaleAndShiftMe(f, -2.0F / n);
_Multiply(node->grad, f, b->grad, 1.0F);
}
DelTensorBuf(f); DelTensorBuf(f);
DelTensorBuf(e); DelTensorBuf(e);
...@@ -1529,7 +1594,6 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient) ...@@ -1529,7 +1594,6 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
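GradReduceVariance differs from the previous case only by the 1/n in the definition (b is the mean, treated here as an independent input of the variance node):

$$c_j = \frac{1}{n}\sum_i (a_{i,j}-b_j)^2 \;\Longrightarrow\; \frac{\partial E}{\partial a_{i,j}} = \frac{2}{n}(a_{i,j}-b_j)\,\frac{\partial E}{\partial c_j},\qquad \frac{\partial E}{\partial b_j} = -\frac{2}{n}\,\frac{\partial E}{\partial c_j}\sum_i (a_{i,j}-b_j),$$

hence the 2.0F / n and -2.0F / n scales in the two branches above.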
/* /*
gradient for operation gradient for operation
for c = matmul(x, w) + b for c = matmul(x, w) + b
...@@ -1554,67 +1618,67 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient) ...@@ -1554,67 +1618,67 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transW = income.GetParamTrans(1); MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
MATRIX_TRANS_TYPE transX = income.GetParamTrans(2); MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
DTYPE alpha = income.GetParam(3); DTYPE alpha = income.GetParam(3);
/* dE/db = dE/dc * x.reduce(0,...,n-1,n+1,...) */
if (!isEfficient || w->isGrad) if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b); XNoder::MakeGrad(b);
int order = node->order; int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM]; int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order); memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
/* compute dE/db */ /* compute dE/db */
if (n == order - 1) { if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM]; int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1]; reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1]; reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the /* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */ size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize); node->grad->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0); _ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad); _Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP); DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize); node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
} }
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]); for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem); /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
_ReduceSum(node->grad, interGrad, 2); XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize); XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
DelTensorBuf(interGrad); node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
} }
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
/* compute dE/dx, dE/dw */ /* compute dE/dx, dE/dw */
XTensor * c = node; XTensor * c = node;
XTensor * dedc = node->grad; XTensor * dedc = node->grad;
...@@ -1623,7 +1687,7 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient) ...@@ -1623,7 +1687,7 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
if (x->order == 2 && w->order == 2) if (x->order == 2 && w->order == 2)
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, alpha, isEfficient); GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, alpha, isEfficient);
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){ else if (transX == X_NOTRANS && x->order > 2 && w->order == 2) {
int orderBackupX = x->order; int orderBackupX = x->order;
int orderBackupC = c->order; int orderBackupC = c->order;
int dimsBackupX[MAX_TENSOR_DIM_NUM]; int dimsBackupX[MAX_TENSOR_DIM_NUM];
......
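The dE/db branch of GradMulAndShift above reduces dE/dc over every axis except the bias axis n; reshaping to 2D (or 3D) only makes that reduction a plain ReduceSum. A standalone illustration of the same reduction on plain arrays (shapes and values are made up for the example; this is not the library API):

    #include <cstdio>

    /* dedc has shape (rows, cols); the bias lives on the column axis,
       so dE/db[j] is the sum of column j, i.e. reshape-to-2D + ReduceSum over dim 0 */
    int main()
    {
        const int rows = 3, cols = 2;                     /* e.g. (batch*len, |b|) */
        float dedc[rows][cols] = {{1, 2}, {3, 4}, {5, 6}};
        float dedb[cols] = {0, 0};

        for (int i = 0; i < rows; i++)
            for (int j = 0; j < cols; j++)
                dedb[j] += dedc[i][j];                    /* accumulate over the non-bias axes */

        printf("dE/db = (%g, %g)\n", dedb[0], dedb[1]);   /* prints (9, 12) */
        return 0;
    }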
...@@ -34,35 +34,35 @@ namespace nts{ ...@@ -34,35 +34,35 @@ namespace nts{
/* compute dE/dx of a node */ /* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient) void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
{ {
if(!isEfficient){ if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!"); CheckNTErrors(node->grad != NULL, "No gradient found!");
} }
else{ else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!"); CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
} }
XLink &income = node->income; XLink &income = node->income;
int operID = income.typeID; int operID = income.typeID;
if(operID == MOVEMENT_COPYINDEXED) if (operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficient); GradCopyIndexed(node, isEfficient);
else if(operID == MOVEMENT_GATHER) else if (operID == MOVEMENT_GATHER)
GradGather(node, isEfficient); GradGather(node, isEfficient);
else if (operID == MOVEMENT_DROPOUTWITHINDEX) else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficient); GradDropoutWithIndex(node, isEfficient);
else if(operID == SHAPE_MERGE) else if (operID == SHAPE_MERGE)
GradMerge(node, isEfficient); GradMerge(node, isEfficient);
else if(operID == SHAPE_MERGE_LIST) else if (operID == SHAPE_MERGE_LIST)
GradMergeList(node, isEfficient); GradMergeList(node, isEfficient);
else if(operID == SHAPE_RESHAPE) else if (operID == SHAPE_RESHAPE)
GradReshape(node, isEfficient); GradReshape(node, isEfficient);
else if(operID == SHAPE_SPLIT) else if (operID == SHAPE_SPLIT)
GradSplit(node, isEfficient); GradSplit(node, isEfficient);
else if(operID == SHAPE_SPLIT_LIST) else if (operID == SHAPE_SPLIT_LIST)
GradSplitList(node, isEfficient); GradSplitList(node, isEfficient);
else if (operID == SHAPE_TRANSPOSE) else if (operID == SHAPE_TRANSPOSE)
GradTranspose(node, isEfficient); GradTranspose(node, isEfficient);
else if(operID == SHAPE_UNSQUEEZE) else if (operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node, isEfficient); GradUnsqueeze(node, isEfficient);
else{ else{
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
...@@ -77,10 +77,10 @@ bool XShapeGrad::IsShapeOP(XTensor * node) ...@@ -77,10 +77,10 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
} }
/* post processing of a node */ /* post processing of a node */
void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficent) void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficient)
{ {
if(typeID == SHAPE_SPLIT_LIST) if (typeID == SHAPE_SPLIT_LIST)
GradSplitListPost(node, isEfficent); GradSplitListPost(node, isEfficient);
} }
/* /*
...@@ -93,7 +93,7 @@ dE/da = spreadforcopyindexed(b) ...@@ -93,7 +93,7 @@ dE/da = spreadforcopyindexed(b)
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
*/ */
void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent) void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
{ {
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
...@@ -105,8 +105,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent) ...@@ -105,8 +105,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
XTensor * srcIndex = income.tails[1]; XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2]; XTensor * tgtIndex = income.tails[2];
XNoder::MakeGrad(input); if (!isEfficient || input->isGrad) {
_SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum); XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
}
} }
/* /*
...@@ -119,16 +126,23 @@ dE/da = spreadforgather(b) ...@@ -119,16 +126,23 @@ dE/da = spreadforgather(b)
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
*/ */
void XShapeGrad::GradGather(XTensor * node, bool isEfficent) void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
{ {
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XTensor * index = income.tails[1]; XTensor * index = income.tails[1];
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
_SpreadForGather(input->grad, node->grad, index); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
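GradGather above spreads the output gradient back onto the gathered rows via _SpreadForGather; when the same row is gathered more than once its contributions must add up. A small standalone sketch of that scatter-add on plain arrays (illustrative only, independent of the library kernel):

    #include <cstdio>

    int main()
    {
        /* forward: b[i] = a[index[i]] */
        const int srcSize = 4, idxSize = 3;
        int   index[idxSize] = {2, 0, 2};            /* row 2 is gathered twice */
        float dedb[idxSize]  = {1.0f, 2.0f, 3.0f};   /* gradient w.r.t. b       */
        float deda[srcSize]  = {0, 0, 0, 0};         /* gradient w.r.t. a       */

        /* backward: scatter-add, duplicated indices accumulate */
        for (int i = 0; i < idxSize; i++)
            deda[index[i]] += dedb[i];

        for (int i = 0; i < srcSize; i++)
            printf("dE/da[%d] = %g\n", i, deda[i]);  /* 2, 0, 4, 0 */
        return 0;
    }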
...@@ -136,7 +150,7 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent) ...@@ -136,7 +150,7 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
/* /*
gradient computation for DropoutWithIndex function gradient computation for DropoutWithIndex function
*/ */
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent) void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
{ {
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
...@@ -144,28 +158,23 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent) ...@@ -144,28 +158,23 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XTensor * index = income.tails[1]; XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0); DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
//_Identity(node->grad, input->grad); XNoder::MakeGrad(input);
_CopyValues(node->grad, input->grad);
int order = node->grad->order; XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
int * dimSize = new int[order]; _CopyValues(node->grad, tmp);
for (int i = 0; i < order; i++) { tmp->Reshape(tmp->unitNum);
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1; _DropoutWithIndex(node->grad, index, tmp);
int * dimSize1 = new int[order1]; _ScaleAndShiftMe(tmp, scale);
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad); tmp->Reshape(input->order, input->dimSize);
_ScaleAndShiftMe(input->grad, scale); _SumMe(input->grad, tmp);
input->grad->Reshape(order, dimSize); DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
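Assuming the forward DropoutWithIndex zeroes the entries listed in index and rescales the rest, the backward rule is simply: dropped positions get no gradient, kept positions get dE/dc times the same scale, which the code above realizes on a flattened temporary buffer before accumulating into dE/da. A plain-array sketch of that rule (illustrative only, not the library kernel):

    #include <cstdio>

    int main()
    {
        const int n = 5, nDropped = 2;
        int   dropped[nDropped] = {1, 3};            /* indices zeroed in the forward pass */
        float scale = 2.0f;                          /* e.g. 1/(1-p)                       */
        float dedc[n] = {1, 1, 1, 1, 1};
        float deda[n];

        for (int i = 0; i < n; i++)
            deda[i] = dedc[i] * scale;               /* kept entries pass the scaled gradient */
        for (int i = 0; i < nDropped; i++)
            deda[dropped[i]] = 0.0f;                 /* dropped entries receive no gradient   */

        for (int i = 0; i < n; i++)
            printf("dE/da[%d] = %g\n", i, deda[i]);  /* 2 0 2 0 2 */
        return 0;
    }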
...@@ -185,7 +194,7 @@ dE/da = split(dE/dc) ...@@ -185,7 +194,7 @@ dE/da = split(dE/dc)
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
*/ */
void XShapeGrad::GradMerge(XTensor * node, bool isEfficent) void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
{ {
XLink &income = node->income; XLink &income = node->income;
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
...@@ -196,62 +205,64 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent) ...@@ -196,62 +205,64 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
int whereToMerge = income.GetParamInt(0); int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1); int leadDim = income.GetParamInt(1);
int blockSize = 1; if (!isEfficient || input->isGrad) {
int blockNum = 1; XNoder::MakeGrad(input);
for(int i = 0; i < input->order; i++){
if(i < leadDim)
blockNum *= input->dimSize[i];
}
blockSize = input->GetDataSizeInChar() / blockNum;
XNoder::MakeGrad(input);
int * dims = new int[input->order]; int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order); memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){ for (int i = 0, j = 0; i < input->order; i++) {
if(i >= leadDim){ if (i >= leadDim) {
dims[j++] = input->dimSize[i]; dims[j++] = input->dimSize[i];
}
} }
} dims[0] = -dims[0];
dims[0] = -dims[0]; XTensor gradInputSmall(input->order - leadDim, dims,
XTensor gradInputSmall(input->order - leadDim, dims, input->dataType, input->denseRatio,
input->dataType, input->denseRatio, input->devID, input->mem);
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
dims[whereToMerge - leadDim] *= dims[0]; XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1, node->dataType, node->denseRatio,
node->dataType, node->denseRatio, node->devID, node->mem);
node->devID, node->mem);
int blockSize = 1;
/* we can simply split the gradient tensor int blockNum = 1;
if the input is used in merging only */ for (int i = 0; i < input->order; i++) {
if(input->outgo.tailNum == 1){ if (i < leadDim)
for(int i = 0; i < blockNum; i++){ blockNum *= input->dimSize[i];
gradNodeSmall.data = (char*)node->grad->data + i * blockSize; }
gradInputSmall.data = (char*)input->grad->data + i * blockSize; blockSize = input->GetDataSizeInChar() / blockNum;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
/* we can simply split the gradient tensor
if the input is used in merging only */
if (input->outgo.tailNum == 1) {
for (int i = 0; i < blockNum; i++) {
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
}
} }
}
/* a more complicated case is that the input tensor is used for
other operations somewhere else. So we have to do gradient
accumulation after spliting, i.e., we need an additional
SUM operation */
else{
XTensor gradInputSmallBuf(&gradInputSmall);
for(int i = 0; i < blockNum; i++){ /* a more complicated case is that the input tensor is used for
gradNodeSmall.data = (char*)node->grad->data + i * blockSize; other operations somewhere else. So we have to do gradient
gradInputSmall.data = (char*)input->grad->data + i * blockSize; accumulation after spliting, i.e., we need an additional
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]); SUM operation */
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall); else {
XTensor gradInputSmallBuf(&gradInputSmall);
for (int i = 0; i < blockNum; i++) {
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
}
} }
}
gradNodeSmall.data = NULL; gradNodeSmall.data = NULL;
gradInputSmall.data = NULL; gradInputSmall.data = NULL;
delete[] dims; delete[] dims;
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
...@@ -279,18 +290,18 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient) ...@@ -279,18 +290,18 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
TensorList smalls(income.tailNum); TensorList smalls(income.tailNum);
TensorList smallsGrad(income.tailNum); TensorList smallsGrad(income.tailNum);
bool mergeOnly = true; bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
for (int i = 0; i < income.tailNum; i++) {
/* TODO! efficient backpropagate */
XTensor * tail = income.tails[i]; XTensor * tail = income.tails[i];
XNoder::MakeGrad(tail); XNoder::MakeGrad(tail);
smalls.Add(tail); smalls.Add(tail);
smallsGrad.Add(tail->grad); smallsGrad.Add(tail->grad);
if(i > 1){ if (i > 1)
CheckNTErrors(_IsSameShaped(last, tail), CheckNTErrors(_IsSameShaped(last, tail), "Input tensors must be of the same size!");
"Input tensors must be of the same size!");
}
if(tail->outgo.tailNum > 1) if (tail->outgo.tailNum > 1)
mergeOnly = false; mergeOnly = false;
last = tail; last = tail;
...@@ -300,7 +311,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient) ...@@ -300,7 +311,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
/* we can simply split the gradient tensor into the input tensors /* we can simply split the gradient tensor into the input tensors
if the inputs are used in merging only */ if the inputs are used in merging only */
if(mergeOnly) if (mergeOnly)
_Split(node->grad, &smallsGrad, whereToMerge, smalls.count); _Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
/* a more complicated case is that the input tensors are used for /* a more complicated case is that the input tensors are used for
...@@ -326,7 +337,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient) ...@@ -326,7 +337,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
last->devID, last->mem); last->devID, last->mem);
/* gradient accumulation for each split */ /* gradient accumulation for each split */
for(int i = 0; i < smalls.count; i++){ for (int i = 0; i < smalls.count; i++) {
XTensor * inputGrad = (XTensor*)smallsGrad.Get(i); XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize; gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
_Sum(inputGrad, &gradSmall, inputGrad); _Sum(inputGrad, &gradSmall, inputGrad);
...@@ -349,17 +360,20 @@ dE/da = reshape(dE/db) ...@@ -349,17 +360,20 @@ dE/da = reshape(dE/db)
>> isEfficient - indicates whether the computation is in >> isEfficient - indicates whether the computation is in
an efficient manner an efficient manner
*/ */
void XShapeGrad::GradReshape(XTensor * node, bool isEfficent) void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
{ {
XLink &income = node->income; XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for RESHAPE!");
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!"); if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
node->grad->Reshape(input->order, input->dimSize); node->grad->Reshape(input->order, input->dimSize);
_CopyValues(node->grad, input->grad); _CopyValues(node->grad, input->grad);
node->grad->Reshape(node->order, node->dimSize); node->grad->Reshape(node->order, node->dimSize);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
...@@ -386,22 +400,24 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient) ...@@ -386,22 +400,24 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!"); CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!");
CheckNTErrors(splitNum == node->dimSize[0], "Wrong split number!"); CheckNTErrors(splitNum == node->dimSize[0], "Wrong split number!");
XNoder::MakeGrad(input); if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
/* we can simply merge the gradient tensor /* we can simply merge the gradient tensor
if the input is used in splitting only */ if the input is used in splitting only */
if(input->outgo.tailNum == 1) if (input->outgo.tailNum == 1)
_Merge(node->grad, input->grad, whereToSplit + 1, 0); _Merge(node->grad, input->grad, whereToSplit + 1, 0);
/* if the tensor is used somewhere else, we need another SUM /* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */ for gradient accumulation */
else{ else {
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem); XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0); _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad); _Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP); DelTensorBuf(inputGradTMP);
}
} }
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
...@@ -449,14 +465,14 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient) ...@@ -449,14 +465,14 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
int whereToSplit = -1; int whereToSplit = -1;
int splitNum = 0; int splitNum = 0;
for(int i = 0; i < outgo.tailNum; i++){ for (int i = 0; i < outgo.tailNum; i++) {
XTensor * parent = (XTensor*)outgo.tails[i]; XTensor * parent = (XTensor*)outgo.tails[i];
XLink &income = parent->income; XLink &income = parent->income;
if(income.typeID == SHAPE_SPLIT_LIST){ if (income.typeID == SHAPE_SPLIT_LIST) {
int w = income.GetParamInt(0); int w = income.GetParamInt(0);
int splitID = income.GetParamInt(1); int splitID = income.GetParamInt(1);
if(whereToSplit < 0) if (whereToSplit < 0)
whereToSplit = w; whereToSplit = w;
splitNum++; splitNum++;
...@@ -468,24 +484,26 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient) ...@@ -468,24 +484,26 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
} }
} }
XNoder::MakeGrad(node); if (!isEfficient || node->isGrad) {
XNoder::MakeGrad(node);
/* we can simply merge the gradient tensor /* we can simply merge the gradient tensor
if the node is used in splitting only */ if the node is used in splitting only */
if(outgo.tailNum == splitNum){ if (outgo.tailNum == splitNum) {
_Merge(&splits, node->grad, whereToSplit); _Merge(&splits, node->grad, whereToSplit);
} }
/* if the tensor is used as input to other nodes /* if the tensor is used as input to other nodes
somewhere else, we need another SUM for gradient somewhere else, we need another SUM for gradient
accumulation */ accumulation */
else{ else {
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem); XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1); _Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad); _Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP); DelTensorBuf(nodeGradTMP);
}
} }
} }
...@@ -506,19 +524,23 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient) ...@@ -506,19 +524,23 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
XTensor * output = node; XTensor * output = node;
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XTensor * b = NewTensorBufV2(input, input->devID, input->mem);
XNoder::MakeGrad(input);
int i = income.GetParamInt(0);
int j = income.GetParamInt(1);
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!"); if (!isEfficient || input->isGrad) {
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!"); XNoder::MakeGrad(input);
_Transpose(output->grad, b, i, j); int i = income.GetParamInt(0);
_Sum(input->grad, b, input->grad); int j = income.GetParamInt(1);
DelTensorBuf(b); CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
...@@ -540,7 +562,6 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient) ...@@ -540,7 +562,6 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
XTensor * output = node; XTensor * output = node;
XTensor * input = income.tails[0]; XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
int dim = income.GetParamInt(0); int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1); int dSize = income.GetParamInt(1);
...@@ -548,12 +569,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient) ...@@ -548,12 +569,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!"); CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!"); CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
XTensor * g = NewTensorBufV2(input->grad, input->devID, input->mem); if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
_ReduceSum(output->grad, g, dim);
_Sum(input->grad, g, input->grad); XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
DelTensorBuf(g); _ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
} }
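The ReduceSum in GradUnsqueeze comes from the chain rule: unsqueeze copies a along a new dimension of size dSize, so each input entry feeds dSize output entries and their gradients sum up (standard calculus, not part of the diff):

$$\frac{\partial E}{\partial a_{k}} = \sum_{j=1}^{\mathrm{dSize}} \frac{\partial E}{\partial c_{j,k}},$$

which is exactly the _ReduceSum of dE/dc over dim in the block above.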
......
...@@ -42,55 +42,55 @@ public: ...@@ -42,55 +42,55 @@ public:
/* post processing of a node */ /* post processing of a node */
static static
void PostProcessing(XTensor * node, int typeId, bool isEfficent); void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private: private:
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */ /* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static static
void GradCopyIndexed(XTensor * node, bool isEfficent); void GradCopyIndexed(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = gather(a, index) */ /* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
static static
void GradGather(XTensor * node, bool isEfficent); void GradGather(XTensor * node, bool isEfficient);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */ /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static static
void GradDropoutWithIndex(XTensor * node, bool isEfficent); void GradDropoutWithIndex(XTensor * node, bool isEfficient);
/* gradient computation for merge: c = merge(a, b, ...) */ /* gradient computation for merge: c = merge(a, b, ...) */
static static
void GradMerge(XTensor * node, bool isEfficent); void GradMerge(XTensor * node, bool isEfficient);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */ /* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static static
void GradMergeList(XTensor * node, bool isEfficent); void GradMergeList(XTensor * node, bool isEfficient);
/* gradient computation for transposing a tensor : b = transpose(a) */ /* gradient computation for transposing a tensor : b = transpose(a) */
static static
void GradTranspose(XTensor * node, bool isEfficent); void GradTranspose(XTensor * node, bool isEfficient);
/* gradient computation for reshaping a tensor: c = reshape(a) */ /* gradient computation for reshaping a tensor: c = reshape(a) */
static static
void GradReshape(XTensor * node, bool isEfficent); void GradReshape(XTensor * node, bool isEfficient);
/* gradient computation for split: c = split(a) */ /* gradient computation for split: c = split(a) */
static static
void GradSplit(XTensor * node, bool isEfficent); void GradSplit(XTensor * node, bool isEfficient);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */ /* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static static
void GradSplitList(XTensor * node, bool isEfficent); void GradSplitList(XTensor * node, bool isEfficient);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a). /* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at one time. This is good for system speed-up. */ manner because we can fuse multiple memory copy jobs at one time. This is good for system speed-up. */
static static
void GradSplitListPost(XTensor * node, bool isEfficent); void GradSplitListPost(XTensor * node, bool isEfficient);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */ /* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static static
void GradUnsqueeze(XTensor * node, bool isEfficent); void GradUnsqueeze(XTensor * node, bool isEfficient);
}; };
......
...@@ -316,7 +316,6 @@ void XNet::ClearGrad(XTensor * node) ...@@ -316,7 +316,6 @@ void XNet::ClearGrad(XTensor * node)
} }
if(finished){ if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad; delete node->grad;
node->grad = NULL; node->grad = NULL;
} }
......
...@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, ...@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputEnc->order - 1] = 1; dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID); InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol); first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */ /* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) { if (inputLast == NULL) {
...@@ -195,7 +195,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, ...@@ -195,7 +195,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor paddingDec; XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID); InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1); paddingDec.SetDataFixed(1);
XTensor maskDec; XTensor maskDec;
XTensor maskEncDec; XTensor maskEncDec;
......
...@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output) ...@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output)
int * words = new int[maxLength]; int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT); InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1); output->SetDataFixed(-1);
/* heap for an input sentence in the batch */ /* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){ for(int h = 0; h < batchSize; h++){
......
...@@ -50,6 +50,15 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName); ...@@ -50,6 +50,15 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
unsigned short FloatToFloat16(float f); unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h); float Float16ToFloat(unsigned short h);
#define CheckDataType(a, b) \
{ \
if((a) != (b)){ \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same datatype (%s vs %s)\n", \
__FILENAME__, __LINE__, GetDataTypeName(a), GetDataTypeName(b)); \
exit(1); \
} \
} \
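A minimal usage sketch for the new CheckDataType macro (the function and tensor names are made up for the example): it is meant to be fed the dataType fields of two tensors an operation is about to combine, and it aborts with a file/line message when the types differ.

    /* sketch: guard a binary operation against mixed-precision inputs */
    void SketchedOp(const XTensor * a, const XTensor * b)
    {
        CheckDataType(a->dataType, b->dataType);   /* exits with an error if the data types differ */
        /* ... below this point a and b can be assumed to share one data type ... */
    }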
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif
\ No newline at end of file
...@@ -64,7 +64,7 @@ ...@@ -64,7 +64,7 @@
#endif #endif
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts { namespace nts{
int tensorIDGlobal = 0; int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex; MUTEX_HANDLE tensorMutex;
...@@ -73,7 +73,7 @@ XTensor NULLTensor; ...@@ -73,7 +73,7 @@ XTensor NULLTensor;
/* generate a tensor id */ /* generate a tensor id */
int MakeTensorID() int MakeTensorID()
{ {
if (tensorIDGlobal == 0) if(tensorIDGlobal == 0)
MUTEX_INIT(tensorMutex); MUTEX_INIT(tensorMutex);
MUTEX_LOCK(tensorMutex); MUTEX_LOCK(tensorMutex);
...@@ -145,7 +145,7 @@ XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE ...@@ -145,7 +145,7 @@ XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE
mem = myMem; mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID; devID = myMem != NULL ? myMem->devID : myDevID;
if (order >= 0) if(order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio); Resize(myOrder, myDimSize, myDataType, myDenseRatio);
} }
...@@ -158,8 +158,8 @@ XTensor::XTensor(const XTensor& reference) ...@@ -158,8 +158,8 @@ XTensor::XTensor(const XTensor& reference)
ShallowCopy(reference); ShallowCopy(reference);
data = NULL; data = NULL;
dataHost = NULL; dataHost = NULL;
if (reference.isTmp) { if(reference.isTmp){
devID = reference.devID; devID = reference.devID;
mem = reference.mem; mem = reference.mem;
data = reference.data; data = reference.data;
...@@ -172,16 +172,16 @@ XTensor::XTensor(const XTensor& reference) ...@@ -172,16 +172,16 @@ XTensor::XTensor(const XTensor& reference)
This is VERY tricky and there might be better solutions :) */ This is VERY tricky and there might be better solutions :) */
*reference.dataP = NULL; *reference.dataP = NULL;
} }
else { else{
devID = reference.devID; devID = reference.devID;
mem = reference.mem; mem = reference.mem;
InitTensorV2(this, &reference); InitTensorV2(this, &reference);
_CopyValues(&reference, this); _CopyValues(&reference, this);
} }
if (reference.isTmp) if(reference.isTmp)
XLink::Replace(&reference, this); XLink::Replace(&reference, this);
else { else{
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!"); CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
XLink::CopyIncoming(&reference, this); XLink::CopyIncoming(&reference, this);
} }
...@@ -225,7 +225,7 @@ XTensor::~XTensor() ...@@ -225,7 +225,7 @@ XTensor::~XTensor()
the connectivity of the graph. To kill memory the connectivity of the graph. To kill memory
leak, we release the data of the new tensor leak, we release the data of the new tensor
when its parent is deleted (see ClearIncoming). */ when its parent is deleted (see ClearIncoming). */
if (outgo.tailNum > 0) { if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int)); memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0]; dims[0] = -dims[0];
...@@ -243,7 +243,7 @@ XTensor::~XTensor() ...@@ -243,7 +243,7 @@ XTensor::~XTensor()
DestroyData(); DestroyData();
if (grad != NULL) if(grad != NULL)
delete grad; delete grad;
} }
...@@ -288,16 +288,16 @@ void XTensor::Init() ...@@ -288,16 +288,16 @@ void XTensor::Init()
/* delete data arrays */ /* delete data arrays */
void XTensor::DestroyData() void XTensor::DestroyData()
{ {
if (data != NULL && mem == NULL && !isShared) if(data != NULL && mem == NULL && !isShared)
XMemFree(devID, data); XMemFree(devID, data);
else if (data != NULL && isInGlobalMem) else if(data != NULL && isInGlobalMem)
FreeData(this, mem); FreeData(this, mem);
else if (data != NULL) else if(data != NULL)
mem->Release(data, GetDataSizeInChar(), signature); mem->Release(data, GetDataSizeInChar(), signature);
data = NULL; data = NULL;
if (dataHost != NULL) if(dataHost != NULL)
delete[] (char*)dataHost; delete[] (char*)dataHost;
dataHost = NULL; dataHost = NULL;
} }
...@@ -330,11 +330,11 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -330,11 +330,11 @@ XTensor& XTensor::operator= (const XTensor& tensor)
{ {
/* we must make a hard copy of the tensor if it is the input /* we must make a hard copy of the tensor if it is the input
of another node. */ of another node. */
if (outgo.tailNum > 0) { if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int)); memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0]; dims[0] = -dims[0];
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem); XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag(); newTensor->SetTMPFlag();
newTensor->data = data; newTensor->data = data;
...@@ -350,35 +350,35 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -350,35 +350,35 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dataHost = NULL; dataHost = NULL;
} }
if (false && !tensor.isTmp) { if(false && !tensor.isTmp){
/* NOTE: this might lead to additional data copy by Mac LLVM compilers */ /* NOTE: this might lead to additional data copy by Mac LLVM compilers */
/* we make an identity transformation here */ /* we make an identity transformation here */
if (outgo.tailNum > 0) if(outgo.tailNum > 0)
XLink::ClearOutgoing(this); XLink::ClearOutgoing(this);
XLink::ClearIncoming(this); XLink::ClearIncoming(this);
if (!_IsSameShaped(this, &tensor)) if(!_IsSameShaped(this, &tensor))
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio); Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_Identity(&tensor, this); _Identity(&tensor, this);
XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY); XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY);
} }
else { else{
/* hard copy of the data array */ /* hard copy of the data array */
int size = unitNum * unitSize; int size = unitNum * unitSize;
if (isInit && !isSparse && !tensor.isSparse && if(isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize && size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) && ((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL) data != NULL)
{ {
XMemCopy(data, devID, tensor.data, tensor.devID, size); XMemCopy(data, devID, tensor.data, tensor.devID, size);
if (dataHost != NULL && tensor.dataHost != NULL) if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size); XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
} }
else { else{
DestroyData(); DestroyData();
if (!isInit) { if(!isInit){
devID = tensor.devID; devID = tensor.devID;
mem = tensor.mem; mem = tensor.mem;
} }
...@@ -407,11 +407,11 @@ XTensor& XTensor::operator= (const XTensor&& tensor) ...@@ -407,11 +407,11 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
{ {
/* we must make a hard copy of the tensor if it is the input /* we must make a hard copy of the tensor if it is the input
of another node. */ of another node. */
if (outgo.tailNum > 0) { if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int)); memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0]; dims[0] = -dims[0];
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem); XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag(); newTensor->SetTMPFlag();
newTensor->data = data; newTensor->data = data;
...@@ -520,7 +520,7 @@ relocate the data on the target device ...@@ -520,7 +520,7 @@ relocate the data on the target device
*/ */
void XTensor::SetDevice(int myDevId, XMem* myMem) void XTensor::SetDevice(int myDevId, XMem* myMem)
{ {
if (myMem == NULL) { if(myMem == NULL){
myMem = GMems.GetMem(myDevId); myMem = GMems.GetMem(myDevId);
} }
FlushToMem(myMem); FlushToMem(myMem);
...@@ -529,7 +529,7 @@ void XTensor::SetDevice(int myDevId, XMem* myMem) ...@@ -529,7 +529,7 @@ void XTensor::SetDevice(int myDevId, XMem* myMem)
bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim) bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
{ {
if (a == NULL || b == NULL) if(a == NULL || b == NULL)
return false; return false;
if ((a->order - 1) != b->order) if ((a->order - 1) != b->order)
...@@ -541,18 +541,18 @@ bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim) ...@@ -541,18 +541,18 @@ bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
return false; return false;
} }
else if (i >= dim) { else if (i >= dim) {
if (a->dimSize[i + 1] != b->dimSize[i]) if (a->dimSize[i+1] != b->dimSize[i])
return false; return false;
} }
} }
if (a->dataType != b->dataType) if(a->dataType != b->dataType)
return false; return false;
if (a->denseRatio != b->denseRatio) if(a->denseRatio != b->denseRatio)
return false; return false;
if (a->isSparse != b->isSparse) if(a->isSparse != b->isSparse)
return false; return false;
return true; return true;
...@@ -579,7 +579,7 @@ int XTensor::GetDim(const int dim) const ...@@ -579,7 +579,7 @@ int XTensor::GetDim(const int dim) const
CheckNTErrors(dim >= -order, "dimension is out of range!"); CheckNTErrors(dim >= -order, "dimension is out of range!");
int d = dim; int d = dim;
if (dim < 0) if(dim < 0)
d = order + dim; d = order + dim;
return dimSize[d]; return dimSize[d];
...@@ -595,7 +595,7 @@ void XTensor::Reshape(const int myOrder, const int* myDimSize) ...@@ -595,7 +595,7 @@ void XTensor::Reshape(const int myOrder, const int* myDimSize)
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
int num = 1; int num = 1;
for (int i = 0; i < myOrder; i++) { for(int i = 0; i < myOrder; i++){
num *= myDimSize[i]; num *= myDimSize[i];
dims[i] = abs(myDimSize[i]); dims[i] = abs(myDimSize[i]);
} }
...@@ -663,7 +663,7 @@ XTensor XTensor::TypeAs(const XTensor input) ...@@ -663,7 +663,7 @@ XTensor XTensor::TypeAs(const XTensor input)
/* get the number of items in the data array */ /* get the number of items in the data array */
int XTensor::GetSize() const int XTensor::GetSize() const
{ {
if (isSparse) if(isSparse)
return unitNumNonZero; return unitNumNonZero;
else else
return unitNum; return unitNum;
...@@ -672,13 +672,13 @@ int XTensor::GetSize() const ...@@ -672,13 +672,13 @@ int XTensor::GetSize() const
/* get the size of the memory space used */ /* get the size of the memory space used */
int XTensor::GetDataSizeInChar() const int XTensor::GetDataSizeInChar() const
{ {
if (isSparse) { if(isSparse){
int num = int(unitNum * denseRatio + 1); int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE); int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num); int size = sizeof(int) + tupleSize*(num);
return size; return size;
} }
else { else{
return unitNum * unitSize; return unitNum * unitSize;
} }
} }
...@@ -690,15 +690,15 @@ get unit size in terms of "dataType" ...@@ -690,15 +690,15 @@ get unit size in terms of "dataType"
*/ */
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const
{ {
if (myDataType == X_INT) if(myDataType == X_INT)
return sizeof(int); return sizeof(int);
else if (myDataType == X_FLOAT) else if(myDataType == X_FLOAT)
return sizeof(float); return sizeof(float);
else if (myDataType == X_DOUBLE) else if(myDataType == X_DOUBLE)
return sizeof(double); return sizeof(double);
else if (myDataType == X_INT8) else if(myDataType == X_INT8)
return 1; return 1;
else if (myDataType == X_FLOAT16) else if(myDataType == X_FLOAT16)
return 2; return 2;
return sizeof(float); return sizeof(float);
} }
...@@ -739,19 +739,19 @@ a vector with all entries of 0 ...@@ -739,19 +739,19 @@ a vector with all entries of 0
*/ */
void XTensor::SetZeroAll(XStream* stream) void XTensor::SetZeroAll(XStream* stream)
{ {
if (data == NULL) if(data == NULL)
return; return;
if (isSparse) { if(isSparse){
if (devID >= 0) { if(devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
int size = sizeof(int) + (sizeof(int) + sizeof(DTYPE)) * unitNumNonZero; int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;
int devIDBackup = 0; int devIDBackup = 0;
cudaGetDevice(&devIDBackup); cudaGetDevice(&devIDBackup);
cudaSetDevice(devID); cudaSetDevice(devID);
if (stream == NULL) if(stream == NULL)
cudaMemset(data, 0, size); cudaMemset(data, 0, size);
else else
cudaMemsetAsync(data, 0, size, stream->stream); cudaMemsetAsync(data, 0, size, stream->stream);
...@@ -764,14 +764,14 @@ void XTensor::SetZeroAll(XStream* stream) ...@@ -764,14 +764,14 @@ void XTensor::SetZeroAll(XStream* stream)
unitNumNonZero = 0; unitNumNonZero = 0;
} }
else { else{
if (devID >= 0) { if(devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
int devIDBackup = 0; int devIDBackup = 0;
cudaGetDevice(&devIDBackup); cudaGetDevice(&devIDBackup);
cudaSetDevice(devID); cudaSetDevice(devID);
if (stream == NULL) if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize); cudaMemset(data, 0, unitNum * unitSize);
else else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream); cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
...@@ -791,7 +791,7 @@ void XTensor::SetZeroAll(XStream* stream) ...@@ -791,7 +791,7 @@ void XTensor::SetZeroAll(XStream* stream)
*/ */
void XTensor::SetData(const void* d, int num, int beg) void XTensor::SetData(const void* d, int num, int beg)
{ {
if (data == NULL || d == NULL) if(data == NULL || d == NULL)
return; return;
CheckNTErrors(!isSparse, "TODO"); CheckNTErrors(!isSparse, "TODO");
...@@ -816,6 +816,16 @@ void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step) ...@@ -816,6 +816,16 @@ void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
_SetDataRange(this, lower, upper, step); _SetDataRange(this, lower, upper, step);
} }
/* generate data items with a fixed value */
template<class T>
void XTensor::SetDataFixed(T num)
{
_SetDataFixed(this, num);
}
template void XTensor::SetDataFixed<int>(int);
template void XTensor::SetDataFixed<float>(float);
template void XTensor::SetDataFixed<double>(double);
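The explicit instantiations above are what allow the T2T call sites earlier in this diff to write first.SetDataFixed(startSymbol) and output->SetDataFixed(-1) directly: the template dispatches on the argument type, and only the int, float and double versions are emitted, so any other argument type would fail at link time. A minimal usage sketch (names and sizes are made up; InitTensor is used as at the existing call sites):

    int dims[2] = {4, 8};
    XTensor bias;
    InitTensor(&bias, 2, dims, X_FLOAT);   /* hypothetical 4x8 tensor on the default device */
    bias.SetDataFixed(0.0F);               /* float instantiation fills every entry with 0  */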
/* /*
set the tensor items by a uniform distribution in range [lower, upper] set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range >> lower - lower value of the range
...@@ -823,62 +833,7 @@ set the tensor items by a uniform distribution in range [lower, upper] ...@@ -823,62 +833,7 @@ set the tensor items by a uniform distribution in range [lower, upper]
*/ */
void XTensor::SetDataRand(DTYPE lower, DTYPE upper) void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{ {
// TODO: GPU code!!!!!!! _SetDataRand(this, lower, upper);
if (data == NULL)
return;
// srand((unsigned)time(0));
DTYPE variance = upper - lower;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
DTYPE value = lower + variance * (float)rand() / RAND_MAX;
*((float*)d + i) = value;
}
}
else if (dataType == X_DOUBLE) {
d = new double[unitNum];
for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
}
else {
delete[] (double*)d;
}
}
/* a gauss distribution (Box-Muller method) */
double GaussRand(DTYPE mean, DTYPE standardDeviation)
{
// TODO: GPU code!!!!!!!
static double u, v;
static int phase = 0;
double z;
double pi = 3.141592654;
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
phase = 1 - phase;
return mean + (z * standardDeviation);
} }
/* /*
...@@ -888,37 +843,7 @@ set the tensor items by a normal distribution ...@@ -888,37 +843,7 @@ set the tensor items by a normal distribution
*/ */
void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation) void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
{ {
// TODO: cuda code!!!!!!! _SetDataRandN(this, mean, standardDeviation);
if (data == NULL)
return;
// srand((unsigned)time(0));
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
*((float*)d + i) = (float)GaussRand(mean, standardDeviation);
}
}
else if (dataType == X_DOUBLE) {
d = new double[unitNum];
for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = GaussRand(mean, standardDeviation);
}
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
}
else {
delete[] (double*)d;
}
} }
/* /*
...@@ -990,20 +915,20 @@ void* XTensor::GetCell(int index[], int size) const ...@@ -990,20 +915,20 @@ void* XTensor::GetCell(int index[], int size) const
CheckNTErrors((size == order), "Illegal index!"); CheckNTErrors((size == order), "Illegal index!");
int offset = index[0]; int offset = index[0];
for (int i = 1; i < size; ++i) { for(int i = 1; i < size; ++i){
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!"); CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i]; offset = offset * dimSize[i] + index[i];
} }
if (isSparse) { if(isSparse){
DTYPE value; DTYPE value;
void* p; void* p;
if (BinarySearch(offset, value, p)) if(BinarySearch(offset, value, p))
return (char*)p + sizeof(int); return (char*)p + sizeof(int);
else else
return NULL; return NULL;
} }
else { else{
return ((char*)data) + offset * unitSize; return ((char*)data) + offset * unitSize;
} }
} }
...@@ -1089,9 +1014,9 @@ int XTensor::GetInt(int offset) const ...@@ -1089,9 +1014,9 @@ int XTensor::GetInt(int offset) const
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!"); CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!"); CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset)."); CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
int* address = (int*)data + offset; int* address = (int*)data + offset;
return ToCPUInt(devID, address); return ToCPUInt(devID, address);
} }
...@@ -1195,7 +1120,7 @@ int XTensor::GetKeyInSparse(int i) const ...@@ -1195,7 +1120,7 @@ int XTensor::GetKeyInSparse(int i) const
char* d = (char*)data + sizeof(int); char* d = (char*)data + sizeof(int);
int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i); int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
return ToCPUInt(devID, key); return ToCPUInt(devID, key);
} }
...@@ -1308,9 +1233,9 @@ bool XTensor::SetInt(int value, int offset) ...@@ -1308,9 +1233,9 @@ bool XTensor::SetInt(int value, int offset)
{ {
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!"); CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!"); CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
int* d = (int*)data + offset; int* d = (int*)data + offset;
return SetToDeviceInt(devID, d, value); return SetToDeviceInt(devID, d, value);
} }
...@@ -1415,7 +1340,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi) ...@@ -1415,7 +1340,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type."); CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
CheckNTErrors(isSparse == false, "TODO!"); CheckNTErrors(isSparse == false, "TODO!");
if (devID < 0) { if(devID < 0){
DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi; DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi;
CheckNTErrors((p != NULL), "No data array is found!"); CheckNTErrors((p != NULL), "No data array is found!");
...@@ -1424,7 +1349,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi) ...@@ -1424,7 +1349,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
return true; return true;
} }
else { else{
int dims[2] = {ni, mi}; int dims[2] = {ni, mi};
return SetToDevice(devID, GetCell(dims, 2), Get2D(ni, mi) + value); return SetToDevice(devID, GetCell(dims, 2), Get2D(ni, mi) + value);
} }
...@@ -1433,24 +1358,24 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi) ...@@ -1433,24 +1358,24 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
/* get the number of non-zero elements (in a sparse tensor) */ /* get the number of non-zero elements (in a sparse tensor) */
int XTensor::GetNonzeroSize() const int XTensor::GetNonzeroSize() const
{ {
if (!isSparse) { if(!isSparse){
XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n"); XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n");
CheckNTErrors(devID < 0, "TODO"); CheckNTErrors(devID < 0, "TODO");
if (dataType == DEFAULT_DTYPE) { if(dataType == DEFAULT_DTYPE){
int count = 0; int count = 0;
for (int i = 0; i < unitNum; i++) { for(int i = 0; i < unitNum; i++){
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE)); DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if (value != 0) if(value != 0)
count++; count++;
} }
return count; return count;
} }
else { else{
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
return -1; return -1;
} }
} }
else { else{
/* return the head of the tuple list */ /* return the head of the tuple list */
return unitNumNonZero; return unitNumNonZero;
} }
...@@ -1481,7 +1406,7 @@ set the tensor as "variable" ...@@ -1481,7 +1406,7 @@ set the tensor as "variable"
void XTensor::SetVarFlag(bool myIsVar) void XTensor::SetVarFlag(bool myIsVar)
{ {
isVar = myIsVar; isVar = myIsVar;
if (isVar) if(isVar)
SetGradFlag(true); SetGradFlag(true);
} }
...@@ -1497,7 +1422,7 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1497,7 +1422,7 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio) const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{ {
/* free old mem */ /* free old mem */
if (data != NULL) { if(data != NULL){
if (mem == NULL) if (mem == NULL)
XMemFree(devID, data); XMemFree(devID, data);
else else
...@@ -1513,11 +1438,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1513,11 +1438,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
bool filledData = true; bool filledData = true;
bool zeroData = false; bool zeroData = false;
for (int i = 0; i < order; i++) { for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]); dimSize[i] = abs(myDimSize[i]);
if (myDimSize[i] < 0) if(myDimSize[i] < 0)
filledData = false; filledData = false;
if (myDimSize[i] == 0) if(myDimSize[i] == 0)
zeroData = true; zeroData = true;
unitNum *= dimSize[i]; unitNum *= dimSize[i];
} }
...@@ -1528,17 +1453,17 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1528,17 +1453,17 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
dataType = myDataType; dataType = myDataType;
unitSize = GetUnitSize(dataType); unitSize = GetUnitSize(dataType);
if (myDataType != DEFAULT_DTYPE) if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false; isDefaultDType = false;
else else
isDefaultDType = true; isDefaultDType = true;
if (zeroData) { if(zeroData){
unitNum = 0; unitNum = 0;
return false; return false;
} }
if (isSparse) { if(isSparse){
/* /*
for sparse matrices, we use a list of tuple (key, value), for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an example, ordered by key. Take a (2-dimensional) matrix as an example,
...@@ -1560,18 +1485,18 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1560,18 +1485,18 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
int tupleSize = sizeof(int) + sizeof(DTYPE); int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num); int size = sizeof(int) + tupleSize * (num);
if (filledData) { if(filledData){
int* d = NULL; int* d = NULL;
if (mem == NULL) { if(mem == NULL){
d = new int[size]; d = new int[size];
memset(d, 0, size); memset(d, 0, size);
} }
else { else{
d = (int*)mem->Alloc(mem->devID, size); d = (int*)mem->Alloc(mem->devID, size);
} }
if (d == NULL) if(d == NULL)
return false; return false;
#if !defined(UNSAFE_BUT_FAST_MEM) #if !defined(UNSAFE_BUT_FAST_MEM)
...@@ -1581,11 +1506,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1581,11 +1506,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
} }
return true; return true;
} }
else { else{
if (filledData) { if(filledData){
/* allocate the new one */ /* allocate the new one */
if (mem == NULL) { if(mem == NULL){
data = XMemAlloc(devID, unitNum * unitSize); data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM) #if defined(UNSAFE_BUT_FAST_MEM)
XMemSet(devID, data, 0, unitNum * unitSize); XMemSet(devID, data, 0, unitNum * unitSize);
#endif #endif
...@@ -1593,12 +1518,12 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize, ...@@ -1593,12 +1518,12 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
else else
data = (void*)mem->Alloc(mem->devID, unitNum * unitSize); data = (void*)mem->Alloc(mem->devID, unitNum * unitSize);
if (data == NULL) if(data == NULL)
return false; return false;
} }
#if !defined(UNSAFE_BUT_FAST_MEM) #if !defined(UNSAFE_BUT_FAST_MEM)
if (data != NULL) if(data != NULL)
XMem::SetZero(data, unitNum * unitSize, mem); XMem::SetZero(data, unitNum * unitSize, mem);
#endif #endif
return true; return true;
...@@ -1614,7 +1539,7 @@ bool XTensor::Resize(const XTensor* myTensor) ...@@ -1614,7 +1539,7 @@ bool XTensor::Resize(const XTensor* myTensor)
denseRatio = myTensor->denseRatio; denseRatio = myTensor->denseRatio;
TENSOR_DATA_TYPE myDataType = myTensor->dataType; TENSOR_DATA_TYPE myDataType = myTensor->dataType;
if (myDataType != DEFAULT_DTYPE) if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false; isDefaultDType = false;
else else
isDefaultDType = true; isDefaultDType = true;
...@@ -1637,7 +1562,7 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const ...@@ -1637,7 +1562,7 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
int* d = (int*)data; int* d = (int*)data;
if (key < 0 || *d == 0) { if(key < 0 || *d == 0){
value = 0; value = 0;
position = NULL; position = NULL;
return false; return false;
...@@ -1652,32 +1577,32 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const ...@@ -1652,32 +1577,32 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
int tupleSize = sizeof(int) + sizeof(DTYPE); int tupleSize = sizeof(int) + sizeof(DTYPE);
char* p = (char*)data + headSize; char* p = (char*)data + headSize;
while (low <= high) { while (low <= high){
int mid = low + (high - low) / 2; int mid = low + (high - low)/2;
k = (int*)(p + tupleSize * mid); k = (int*)(p + tupleSize * mid);
if (*k == key) { if(*k == key){
ok = true; ok = true;
high = mid - 1; high = mid - 1;
break; break;
} }
else if (*k > key) { else if(*k > key){
high = mid - 1; high = mid - 1;
} }
else { else{
low = mid + 1; low = mid + 1;
last = mid; last = mid;
} }
} }
if (ok) { if(ok){
DTYPE* p = (DTYPE*)((char*)k + sizeof(int)); DTYPE* p = (DTYPE*)((char*)k + sizeof(int));
value = *p; value = *p;
position = k; position = k;
return true; return true;
} }
else { else{
value = 0; value = 0;
if (last == -1) if(last == -1)
position = NULL; position = NULL;
else else
position = (char*)data + headSize + tupleSize * last; position = (char*)data + headSize + tupleSize * last;
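BinarySearch above walks the packed (key, value) tuple list that a sparse tensor stores after the leading tuple count (see the layout note in Resize). A standalone sketch of a binary search over that kind of packed buffer, assuming int keys and float values:

#include <cstdio>
#include <cstring>

/* assumed layout: [int tupleNum][key0][value0][key1][value1]... , sorted by key */
bool SearchPacked(const char * buf, int key, float & value)
{
    int num = 0;
    memcpy(&num, buf, sizeof(int));
    const char * tuples = buf + sizeof(int);
    const int tupleSize = sizeof(int) + sizeof(float);
    int low = 0, high = num - 1;
    while (low <= high) {
        int mid = low + (high - low) / 2;
        const char * t = tuples + tupleSize * mid;
        int k = 0;
        memcpy(&k, t, sizeof(int));
        if (k == key) {
            memcpy(&value, t + sizeof(int), sizeof(float));
            return true;
        }
        else if (k > key)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return false;
}

int main()
{
    /* three tuples: (1, 0.5), (4, 2.0), (9, -1.0) */
    char buf[sizeof(int) + 3 * (sizeof(int) + sizeof(float))];
    int num = 3;
    int keys[3] = {1, 4, 9};
    float vals[3] = {0.5F, 2.0F, -1.0F};
    memcpy(buf, &num, sizeof(int));
    char * p = buf + sizeof(int);
    for (int i = 0; i < 3; i++) {
        memcpy(p, &keys[i], sizeof(int));   p += sizeof(int);
        memcpy(p, &vals[i], sizeof(float)); p += sizeof(float);
    }
    float v = 0;
    printf("%d %f\n", (int)SearchPacked(buf, 4, v), v);   /* prints: 1 2.000000 */
    return 0;
}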
...@@ -1730,10 +1655,10 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co ...@@ -1730,10 +1655,10 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
if (label != NULL) if (label != NULL)
fprintf(file, "%s ", label); fprintf(file, "%s ", label);
if (isInit) { if(isInit){
fprintf(file, "order=%d dimsize=", order); fprintf(file, "order=%d dimsize=", order);
if (order == 0) { if(order == 0) {
fprintf(file, "%d,", dimSize[0]); fprintf(file, "%d,", dimSize[0]);
} }
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
...@@ -1742,21 +1667,21 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co ...@@ -1742,21 +1667,21 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
fprintf(file, ","); fprintf(file, ",");
} }
} }
else { else{
fprintf(file, "order=-1 dimsize=-1"); fprintf(file, "order=-1 dimsize=-1");
} }
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio); fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
if (!isInit) { if(!isInit){
fprintf(file, "NULL"); fprintf(file, "NULL");
} }
if (!isSparse) { if (!isSparse) {
if (dataType == DEFAULT_DTYPE) { if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum); int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) { for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i]; DTYPE f = ((DTYPE*)d)[i];
if (i == beg) if(i == beg)
fprintf(file, "%e", f); fprintf(file, "%e", f);
else else
fprintf(file, " %e", f); fprintf(file, " %e", f);
...@@ -1765,9 +1690,9 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co ...@@ -1765,9 +1690,9 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
} }
else if (dataType == X_INT) { else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum); int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) { for(int i = beg; i < end; i++){
int f = ((int*)d)[i]; int f = ((int*)d)[i];
if (i == beg) if(i == beg)
fprintf(file, "%d", f); fprintf(file, "%d", f);
else else
fprintf(file, " %d", f); fprintf(file, " %d", f);
...@@ -1858,7 +1783,7 @@ void XTensor::Read(FILE* file, const char* label) ...@@ -1858,7 +1783,7 @@ void XTensor::Read(FILE* file, const char* label)
fgetc(file); fgetc(file);
if (fscanf(file, "order=%d dimsize=%s dtype=%s dense=%f", if (fscanf(file, "order=%d dimsize=%s dtype=%s dense=%f",
&dimNum, dimSizeName, typeName, &dRatio) < 4) { &dimNum, dimSizeName, typeName, &dRatio) < 4) {
ShowNTErrors("Incorrect format when reading the tensor!"); ShowNTErrors("Incorrect format when reading the tensor!");
} }
...@@ -2018,22 +1943,22 @@ allocate the memory space of the tensor (in the global memory) ...@@ -2018,22 +1943,22 @@ allocate the memory space of the tensor (in the global memory)
*/ */
void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf) void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf)
{ {
if (tensor == NULL) if(tensor == NULL)
return; return;
if (myMem == NULL) { if(myMem == NULL){
if (tensor->data != NULL) if(tensor->data != NULL)
FreeData(tensor, NULL, false); FreeData(tensor, NULL, false);
tensor->data = XMemAlloc(tensor->devID, tensor->GetDataSizeInChar()); tensor->data = XMemAlloc(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true; tensor->isInGlobalMem = true;
} }
else { else{
CheckNTErrors((tensor->data == NULL), "Cannot renew the space for the tensor"); CheckNTErrors((tensor->data == NULL), "Cannot renew the space for the tensor");
if (useBuf) { if(useBuf){
tensor->data = myMem->AllocBuf(tensor->devID, tensor->GetDataSizeInChar()); tensor->data = myMem->AllocBuf(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = false; tensor->isInGlobalMem = false;
} }
else { else{
tensor->data = myMem->AllocGlobal(tensor->devID, tensor->GetDataSizeInChar()); tensor->data = myMem->AllocGlobal(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true; tensor->isInGlobalMem = true;
} }
...@@ -2050,14 +1975,14 @@ free the memory space of the tensor (in the global memory) ...@@ -2050,14 +1975,14 @@ free the memory space of the tensor (in the global memory)
*/ */
void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf) void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf)
{ {
if (tensor == NULL) if(tensor == NULL)
return; return;
if (myMem == NULL) { if(myMem == NULL){
XMemFree(tensor->devID, tensor->data); XMemFree(tensor->devID, tensor->data);
} }
else { else{
if (tensor->isInGlobalMem) if(tensor->isInGlobalMem)
myMem->ReleaseGlobal(tensor->devID, tensor->data); myMem->ReleaseGlobal(tensor->devID, tensor->data);
else else
myMem->ReleaseBuf(tensor->devID, tensor->GetDataSizeInChar()); myMem->ReleaseBuf(tensor->devID, tensor->GetDataSizeInChar());
......
...@@ -303,6 +303,10 @@ public: ...@@ -303,6 +303,10 @@ public:
/* generate data items with a range by start, end and the step */ /* generate data items with a range by start, end and the step */
void Range(DTYPE lower, DTYPE upper, DTYPE step); void Range(DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items with a fixed value */
template<class T>
void SetDataFixed(T num);
/* set tensor items by a uniform distribution */ /* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F); void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
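A hypothetical usage sketch of the member functions declared above; the construction helper InitTensor2DV2 and its signature are assumptions for illustration, not taken from this diff:

/* hypothetical usage sketch; InitTensor2DV2 is an assumed construction helper */
XTensor weight;
InitTensor2DV2(&weight, 512, 512, X_FLOAT, -1);   /* assumed: 512 x 512 float tensor on the CPU */
weight.SetDataFixed(0.0F);                        /* fill every entry with 0 */
weight.SetDataRand(-0.1F, 0.1F);                  /* then draw entries from U(-0.1, 0.1) */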
...@@ -423,11 +427,11 @@ public: ...@@ -423,11 +427,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const; bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */ /* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */ /* dump data to a file */
static static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0); void Dump(const XTensor * tensor, FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */ /* dump data to a binary file */
void BinaryDump(FILE * file); void BinaryDump(FILE * file);
......
...@@ -116,7 +116,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, ...@@ -116,7 +116,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
float confidence = 1 - labelSmoothingP; float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size; float lowconfidence = labelSmoothingP / size;
_SetDataFixedFloat(onehot, lowconfidence); onehot->SetDataFixed(lowconfidence);
#ifdef USE_CUDA #ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
......
...@@ -76,278 +76,191 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain) ...@@ -76,278 +76,191 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
//_SetDataRand(tensor, -finfout, finfout); //_SetDataRand(tensor, -finfout, finfout);
} }
/* /*
generate data items with a fixed value p set a data array with a fixed value
>> tensor - the tensor whose data array would be initialized
>> p - pointer to the number for initializing the tensor >> d - pointer to the data array
>> v - the initial value
>> size - size of the array
*/ */
void _SetDataFixed(XTensor * tensor, void * valuePointer) template<class T>
void ArraySetDataFixed(T * d, T v, int size)
{ {
int num = tensor->unitNum; if (size % 4 == 0) {
for (int i = 0; i < size; i += 4) {
if(tensor->dataType == X_INT){ d[i] = v;
int p = *(int*)valuePointer; d[i + 1] = v;
if(tensor->devID < 0){ d[i + 2] = v;
int * d = (int*)tensor->data; d[i + 3] = v;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedInt(tensor, p);
#endif
}
}
else if(tensor->dataType == X_FLOAT){
float p = *(float*)valuePointer;
if(tensor->devID < 0){
float * d = (float*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedFloat(tensor, p);
#endif
}
}
else if(tensor->dataType == X_DOUBLE){
double p = *(double*)valuePointer;
if(tensor->devID < 0){
double * d = (double*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedDouble(tensor, p);
#endif
} }
} }
else{ else {
ShowNTErrors("TODO"); for (int i = 0; i < size; i++)
d[i] = v;
} }
} }
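ArraySetDataFixed unrolls the store loop by four whenever the size is a multiple of four and otherwise falls back to a plain loop; functionally it is just a fill. For comparison, a standalone sketch of the same effect with the standard library:

#include <algorithm>
#include <cstdio>

int main()
{
    float d[8];
    std::fill_n(d, 8, 3.0F);            /* same result as ArraySetDataFixed(d, 3.0F, 8) */
    printf("%f %f\n", d[0], d[7]);      /* 3.000000 3.000000 */
    return 0;
}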
/*
generate data items with a fixed value p (in default type)
>> tensor - the tensor whose data array would be initialized
>> p - number in default type
*/
void SetDataFixed(XTensor &tensor, DTYPE p)
{
_SetDataFixed(&tensor, &p);
}
/* /*
generate data items with a fixed value p (in integer) generate data items with a fixed value
>> tensor - the tensor whose data array would be initialized
>> p - an integer
*/
void SetDataFixedInt(XTensor &tensor, int p)
{
CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
_SetDataFixed(&tensor, &p);
}
/* >> tensor - the tensor for initialization
generate data items with a fixed value p (in integer) >> value - the initial value
>> tensor - the tensor whose data array would be initialized
>> p - an int-valued number
*/ */
void _SetDataFixedInt(XTensor * tensor, int p) template<class T>
void _SetDataFixed(XTensor * tensor, T value)
{ {
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!"); if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataFixed(tensor, value);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
if(p == 0) int num = tensor->unitNum;
tensor->SetZeroAll();
if (tensor->dataType == X_INT)
ArraySetDataFixed((int*)tensor->data, (int)value, num);
else if (tensor->dataType == X_FLOAT)
ArraySetDataFixed((float*)tensor->data, (float)value, num);
else if (tensor->dataType == X_DOUBLE)
ArraySetDataFixed((double*)tensor->data, (double)value, num);
else else
_SetDataFixed(tensor, &p); ShowNTErrors("TODO! Unsupported datatype!")
} }
template void _SetDataFixed<int>(XTensor*, int);
template void _SetDataFixed<float>(XTensor*, float);
template void _SetDataFixed<double>(XTensor*, double);
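The template body of _SetDataFixed lives in this .cpp file, so the explicit instantiations above are what make the int, float and double versions available to callers that only see the header declaration. A minimal standalone sketch of that pattern, with hypothetical names:

/* fill.h -- declaration only (hypothetical names) */
template<class T>
void Fill(T * d, T v, int n);

/* fill.cpp -- definition plus explicit instantiations, mirroring the pattern above */
template<class T>
void Fill(T * d, T v, int n)
{
    for (int i = 0; i < n; i++)
        d[i] = v;
}
template void Fill<int>(int*, int, int);
template void Fill<float>(float*, float, int);
template void Fill<double>(double*, double, int);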
/* /*
generate data items with a fixed value p (in float) generate data items with a fixed value p only if the condition entry is non-zero
>> tensor - the tensor whose data array would be initialized
>> p - a float-valued number
*/
void _SetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
}
/* >> d - pointer to the data array
generate data items with a fixed value p (in double) >> c - pointer to the condition array
>> tensor - the tensor whose data array would be initialized >> v - the initial value
>> p - a double-valued number >> size - size of the array
*/ */
void _SetDataFixedDouble(XTensor * tensor, double p) template<class T>
void ArraySetDataFixedCond(T* d, T* c, T v, int size)
{ {
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!"); for (int i = 0; i < size; i++) {
if (c[i] != 0)
if(p == 0) d[i] = v;
tensor->SetZeroAll(); }
else
_SetDataFixed(tensor, &p);
} }
/* /*
generate data items with a fixed value p only if generate data items with a fixed value p only if the condition entry is non-zero
the condition entry is non-zero
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
>> condition - the condition tensor whose entries would be checked >> condition - the condition tensor whose entries would be checked
for set the corresponding entries in "tensor" for set the corresponding entries in "tensor"
>> p - a given value >> value - a given value
*/ */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p) template<class T>
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, T value)
{ {
int num = tensor->unitNum; CheckDev(tensor->devID, condition->devID);
CheckDataType(tensor->dataType, condition->dataType);
CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!"); if (tensor->devID >= 0) {
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
if(tensor->dataType == DEFAULT_DTYPE){
if(tensor->devID < 0){
DTYPE * data = (DTYPE*)tensor->data;
DTYPE * cond = (DTYPE*)condition->data;
for(int i = 0; i < num; i++){
if(cond[i] != 0)
data[i] = p;
}
}
else{
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaSetDataFixedCondFloat(tensor, condition, p); _CudaSetDataFixedCond(tensor, condition, value);
return;
#else #else
ShowNTErrors("Please specify USE_CUDA and recompile the code"); ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
}
}
else{
ShowNTErrors("the tensor should be in integer typed!");
} }
}
/*
generate data items with a fixed value p only if
the condition entry is non-zero
>> tensor - the tensor whose data array would be initialized
>> condition - the condition tensor whose entries would be checked
for set the corresponding entries in "tensor"
>> p - a given value
*/
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
{
int num = tensor->unitNum; int num = tensor->unitNum;
CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!"); if (tensor->dataType == X_INT)
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!"); ArraySetDataFixedCond((int*)tensor->data, (int*)condition->data, (int)value, num);
else if (tensor->dataType == X_FLOAT)
if(tensor->dataType == DEFAULT_DTYPE){ ArraySetDataFixedCond((float*)tensor->data, (float*)condition->data, (float)value, num);
if(tensor->devID < 0){ else if (tensor->dataType == X_DOUBLE)
int * data = (int*)tensor->data; ArraySetDataFixedCond((double*)tensor->data, (double*)condition->data, (double)value, num);
int * cond = (int*)condition->data; else
for(int i = 0; i < num; i++){ ShowNTErrors("TODO! Unsupported datatype!")
if(cond[i] != 0)
data[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedCondInt(tensor, condition, p);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code");
#endif
}
}
else{
ShowNTErrors("TODO!");
}
} }
template void _SetDataFixedCond<int>(XTensor*, XTensor*, int);
template void _SetDataFixedCond<float>(XTensor*, XTensor*, float);
template void _SetDataFixedCond<double>(XTensor*, XTensor*, double);
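_SetDataFixedCond is effectively a masked assignment: every entry whose condition value is non-zero receives the given value. A standalone sketch of the same operation on plain arrays:

#include <cstdio>

int main()
{
    float d[5] = {1, 2, 3, 4, 5};
    float c[5] = {0, 1, 0, 1, 1};      /* condition: non-zero entries select positions */
    for (int i = 0; i < 5; i++)
        if (c[i] != 0)
            d[i] = -1.0F;
    for (int i = 0; i < 5; i++)
        printf("%g ", d[i]);           /* 1 -1 3 -1 -1 */
    printf("\n");
    return 0;
}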
/* /*
set data items along with a given dimension (and keep the remaining items unchanged) set data items along with a given dimension (and keep the remaining items unchanged)
>> tensor - the tensor whose data array would be initialized
>> tensor - the tensor for initialization
>> beg - the beginning position >> beg - the beginning position
>> len - length along with the given dimension >> len - length along with the given dimension
>> dim - the dimension along which we set the data >> dim - the dimension along which we set the data
e.g., given a 3 * 3 tensor e.g., given a 3 * 3 tensor
1 2 3 1 2 3
4 5 6 4 5 6
7 8 9 7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have when beg = 1, len = 1, dim = 0 and value = 0, we have
1 2 3 1 2 3
0 0 0 0 0 0
7 8 9 7 8 9
i.e., we set all entries of row 1 to 0 i.e., we set all entries of row 1 to 0
>> value - the given value
*/ */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p) template<class T>
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T value)
{ {
int n = tensor->order; int order = tensor->order;
int size = tensor->GetDim(dim);
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!"); if (dim < 0)
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!"); dim = order + dim;
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
if(tensor->devID < 0){
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int l = len * stride; CheckNTErrors(dim < order && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < size, "Illegal beginning position!");
CheckNTErrors(len >= 0 && beg + len <= size, "Illegal length!");
for(int i = 0; i < blockNum; i++){ if (tensor->devID >= 0) {
DTYPE * d = (DTYPE*)tensor->data + blockSize * i + beg * stride;
for(int j = 0; j < l; j++)
d[j] = p;
}
}
else{
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaSetDataDim(tensor, beg, len, dim, p); _CudaSetDataDim(tensor, beg, len, dim, (DTYPE)value);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
} }
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = order - 1; i > dim; i--)
stride *= tensor->GetDim(i);
blockSize = stride * size;
blockNum = tensor->unitNum / blockSize;
int initNum = len * stride;
for(int i = 0; i < blockNum; i++) {
if (tensor->dataType == X_INT) {
int* d = (int*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (int)value;
}
else if (tensor->dataType == X_FLOAT) {
float* d = (float*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (float)value;
}
else if (tensor->dataType == X_DOUBLE) {
double* d = (double*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (double)value;
}
else
ShowNTErrors("TODO! Unsupported datatype!")
}
} }
template void _SetDataDim<int>(XTensor*, int, int, int, int);
template void _SetDataDim<float>(XTensor*, int, int, int, float);
template void _SetDataDim<double>(XTensor*, int, int, int, double);
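_SetDataDim views the tensor as blockNum blocks of blockSize = stride * dimSize[dim] elements, where stride is the product of the dimensions after dim, and overwrites len * stride entries per block starting at beg * stride. A standalone sketch of that index arithmetic for an assumed 2 x 3 x 4 tensor with dim = 1, beg = 1, len = 2:

#include <cstdio>

int main()
{
    int dimSize[3] = {2, 3, 4};
    int order = 3, dim = 1, beg = 1, len = 2;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

    int stride = 1;
    for (int i = order - 1; i > dim; i--)
        stride *= dimSize[i];                 /* 4 */
    int blockSize = stride * dimSize[dim];    /* 12 */
    int blockNum = unitNum / blockSize;       /* 2 */

    /* count the elements that _SetDataDim would overwrite */
    int touched = 0;
    for (int i = 0; i < blockNum; i++)
        touched += len * stride;              /* 8 per block */
    printf("stride=%d blockSize=%d blockNum=%d touched=%d\n",
           stride, blockSize, blockNum, touched);   /* 4 12 2 16 */
    return 0;
}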
/* /*
modify data items along with a given index and dimension (and keep the remaining items unchanged) modify data items along with a given index and dimension (and keep the remaining items unchanged)
...@@ -355,115 +268,140 @@ modify data items along with a given index and dimension (and keep the remaining ...@@ -355,115 +268,140 @@ modify data items along with a given index and dimension (and keep the remaining
>> modify - the tensor whose data array would be used to modify the source tensor >> modify - the tensor whose data array would be used to modify the source tensor
>> dim - the dimension along which we modify the tensor >> dim - the dimension along which we modify the tensor
>> index - index of the given dimension >> index - index of the given dimension
e.g., given a source tensor (3, 3) e.g., given a source tensor (3, 3)
1 2 3 1 2 3
4 5 6 4 5 6
7 8 9 7 8 9
given a modified tensor (3) given a modified tensor (3)
1 2 3 1 2 3
when dim = 0, index = 1, we have when dim = 0, index = 1, we have
1 2 3 1 2 3
1 2 3 1 2 3
7 8 9 7 8 9
i.e., we set entries of row 1 to {1, 2, 3} i.e., we set entries of row 1 to {1, 2, 3}
*/ */
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index) void _SetDataIndexed(XTensor * tensor, XTensor * modify, int dim, int index)
{ {
int order = source->order; int order = tensor->order;
int size = source->GetDim(dim); int size = tensor->GetDim(dim);
if (dim < 0)
dim = order + dim;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckDev(tensor->devID, modify->devID);
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!"); CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
CheckNTErrors(index >= 0 && index < size, "Illegal index!"); CheckNTErrors(index >= 0 && index < size, "Illegal index!");
for(int i = 0; i < order - 1; i++){ for(int i = 0; i < order - 1; i++) {
if(i < dim){ if(i < dim) {
CheckNTErrors(modify->GetDim(i) == source->GetDim(i), "Illegal dimension!"); CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i), "Illegal dimension!");
} }
else if(i >= dim){ else if(i >= dim) {
CheckNTErrors(modify->GetDim(i) == source->GetDim(i+1), "Illegal dimension!"); CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i+1), "Illegal dimension!");
} }
} }
if(source->devID < 0 && modify->devID < 0){ if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataIndexed(tensor, modify, dim, index);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
if(tensor->devID < 0) {
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = order - 1; i > dim; i--){ for (int i = order - 1; i > dim; i--) {
stride *= source->GetDim(i); stride *= tensor->GetDim(i);
} }
blockSize = stride * source->GetDim(dim); blockSize = stride * tensor->GetDim(dim);
blockNum = source->unitNum / blockSize; blockNum = tensor->unitNum / blockSize;
for(int i = 0; i < blockNum; i++){ for (int i = 0; i < blockNum; i++) {
DTYPE * d = (DTYPE*)source->data + blockSize * i + index * stride; DTYPE * d = (DTYPE*)tensor->data + blockSize * i + index * stride;
DTYPE * p = (DTYPE*)modify->data + stride * i; DTYPE * p = (DTYPE*)modify->data + stride * i;
for(int j = 0; j < stride; j++) for(int j = 0; j < stride; j++)
d[j] = p[j]; d[j] = p[j];
} }
} }
else if(source->devID >= 0 && modify->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataIndexed(source, modify, dim, index);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
ShowNTErrors("TODO!");
}
} }
/* /*
generate data as lower triangular matrices for the last two dimensions generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data is to be set >> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices >> value - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal >> shift - the offset from the diagonal
e.g., for a 3 * 3 tensor,
when value = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when value = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/ */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift) void _SetDataLowTri(XTensor * tensor, DTYPE value, int shift)
{ {
int n = tensor->order; int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!"); CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2), CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!"); "The last two dimensions must be of the same size!");
if(tensor->devID < 0){ tensor->SetZeroAll();
int l = tensor->GetDim(-1); if (tensor->devID >= 0) {
int blockNum = 1; #ifdef USE_CUDA
int blockSize = l * l; _CudaSetDataLowTri(tensor, value, shift);
for(int i = 0; i < n - 2; i++) return;
blockNum *= tensor->GetDim(i); #else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
for(int i = 0; i < blockNum; i++){ #endif
DTYPE * d = (DTYPE*)tensor->data + i * blockSize; }
for(int row = 0; row < l; row++){
for(int col = 0; col <= row + shift; col++){ int size = tensor->GetDim(-1);
d[row * l + col] = p; int blockSize = size * size;
int blockNum = tensor->unitNum / blockSize;
for (int i = 0; i < blockNum; i++) {
for (int row = 0; row < size; row++) {
if (tensor->dataType == X_INT) {
int * d = (int*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (int)value;
} }
for(int col = MAX(0, row + shift + 1); col < l; col++){ /*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * l + col] = 0; d[row * size + col] = 0;
}*/
}
else if (tensor->dataType == X_FLOAT) {
float * d = (float*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (float)value;
}
/*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * size + col] = 0;
}*/
}
else if (tensor->dataType == X_DOUBLE) {
double * d = (double*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (double)value;
} }
/*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * size + col] = 0;
}*/
} }
else
ShowNTErrors("TODO! Unsupported datatype!")
} }
} }
else{
#ifdef USE_CUDA
_CudaSetDataLowTri(tensor, p, shift);
#endif
}
} }
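The loops above write value into every entry (row, col) with col <= row + shift of each size x size block, after SetZeroAll has already cleared the rest. A standalone sketch for a single 3 x 3 block with shift = -1:

#include <cstdio>

int main()
{
    const int size = 3;
    float d[size * size] = {0};   /* corresponds to the SetZeroAll call above */
    float value = 2.0F;
    int shift = -1;
    for (int row = 0; row < size; row++)
        for (int col = 0; col <= row + shift; col++)
            d[row * size + col] = value;
    for (int row = 0; row < size; row++) {
        for (int col = 0; col < size; col++)
            printf("%g ", d[row * size + col]);
        printf("\n");
    }
    /* prints:
       0 0 0
       2 0 0
       2 2 0  */
    return 0;
}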
/* generate data items with a uniform distribution in [0, 1] */ /* generate data items with a uniform distribution in [0, 1] */
...@@ -484,7 +422,7 @@ generate data items with a uniform distribution in [lower, upper] ...@@ -484,7 +422,7 @@ generate data items with a uniform distribution in [lower, upper]
*/ */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{ {
CheckNTErrors(upper > lower, "the high value must be greater than low value!"); CheckNTErrors(upper >= lower, "the upper value must not be less than the lower value!");
if(tensor == NULL) if(tensor == NULL)
return; return;
...@@ -506,27 +444,50 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -506,27 +444,50 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
} }
} }
else{ else{
ShowNTErrors("TODO"); ShowNTErrors("TODO! Unsupported datatype!")
} }
} }
/*
GPU code
The trick here is to initialize the data in a temporary tensor on the CPU.
The CPU data is then copied to GPU.
TODO: generate data points on GPUs straightforwardly.
*/
else{ else{
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaSetDataRand(tensor, lower, upper); /*
GPU code
The trick here is to initialize the data in a temporary tensor on the CPU.
The CPU data is then copied to GPU.
TODO: generate data points on GPUs straightforwardly.
*/
//_CudaSetDataRand(tensor, lower, upper);
int num = tensor->unitNum;
DTYPE variance = upper - lower;
void * d = NULL;
if (tensor->dataType == X_FLOAT) {
d = new float[num];
for (int i = 0; i < num; i++)
*((float*)d + i) = lower + variance * (float)rand() / RAND_MAX;
}
else if (tensor->dataType == X_DOUBLE) {
d = new double[num];
for (int i = 0; i < num; i++)
*((double*)d + i) = (double)lower + variance * rand() / RAND_MAX;
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
tensor->SetData(d, num);
if (tensor->dataType == X_FLOAT) {
delete[](float*)d;
}
else {
delete[](double*)d;
}
#endif #endif
//XTensor * t2 = NewTensorV2(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//_SetDataRand(t2, low, high);
//_CopyValues(t2, tensor);
//delete t2;
} }
} }
/* generate data items with a range by start, end and the step /* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
>> start - the begin of the array >> start - the begin of the array
>> end - the end of the array (not included self) >> end - the end of the array (not included self)
...@@ -537,7 +498,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) ...@@ -537,7 +498,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!"); CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */ /* compute the true length according to the (start, end, step) */
DTYPE size = fabs(upper - lower); DTYPE size = (DTYPE)fabs(upper - lower);
int num = ceil(size / fabs(step)); int num = ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched."); CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");
...@@ -554,7 +515,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) ...@@ -554,7 +515,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
*((float*)data + i) = lower + i * step; *((float*)data + i) = lower + i * step;
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO! Unsupported datatype!")
} }
/* set the data from the array */ /* set the data from the array */
...@@ -564,8 +525,10 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) ...@@ -564,8 +525,10 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
} }
/* /*
generate data items with a uniform distribution in [lower, upper] and set generate data items with a uniform distribution in [lower, upper] and
the item to a pre-defined value if the item >= p, set the item to 0 otherwise set the item to a pre-defined value if the item >= p,
set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range >> lower - lower value of the range
>> upper - upper value of the range >> upper - upper value of the range
...@@ -595,9 +558,31 @@ void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE va ...@@ -595,9 +558,31 @@ void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE va
#endif // USE_CUDA #endif // USE_CUDA
} }
} }
/* draw a sample from a Gaussian distribution (Box-Muller method) */
double GaussRand(DTYPE mean, DTYPE standardDeviation)
{
static double u, v;
static int phase = 0;
double z;
double pi = 3.141592654;
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
phase = 1 - phase;
return mean + (z * standardDeviation);
}
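GaussRand is the Box-Muller transform: two uniforms u, v in (0, 1] give sqrt(-2 ln u) * sin(2*pi*v), and the same pair is reused with cos on the next call. A standalone sanity check (simplified to one sample per uniform pair) that the sample mean and standard deviation land near the requested values:

#include <cstdio>
#include <cstdlib>
#include <cmath>

/* Box-Muller sketch: returns one N(mean, sd) sample per call */
double Gauss(double mean, double sd)
{
    double u = (rand() + 1.0) / (RAND_MAX + 1.0);
    double v = (rand() + 1.0) / (RAND_MAX + 1.0);
    double z = sqrt(-2.0 * log(u)) * sin(2.0 * 3.141592654 * v);
    return mean + z * sd;
}

int main()
{
    const int n = 100000;
    double sum = 0, sum2 = 0;
    for (int i = 0; i < n; i++) {
        double x = Gauss(1.0, 2.0);
        sum += x;
        sum2 += x * x;
    }
    double mean = sum / n;
    double var = sum2 / n - mean * mean;
    printf("mean=%f std=%f\n", mean, sqrt(var));   /* roughly 1.0 and 2.0 */
    return 0;
}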
/* /*
generate data items with a normal distribution with specified mean and standard deviation generate data items with a normal distribution with specified mean and standard deviation
>> tensor - the tensor that keeps the data >> tensor - the tensor that keeps the data
>> mean - mean or expectation of the distribution >> mean - mean or expectation of the distribution
>> standardDeviation - standard deviation of the distribution >> standardDeviation - standard deviation of the distribution
...@@ -605,7 +590,31 @@ generate data items with a normal distribution with specified mean and standard ...@@ -605,7 +590,31 @@ generate data items with a normal distribution with specified mean and standard
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation) void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
{ {
// TODO: rewrite it and add cuda code!!!!!!! // TODO: rewrite it and add cuda code!!!!!!!
tensor->SetDataRandn(mean, standardDeviation); int num = tensor->unitNum;
void * d = NULL;
if (tensor->dataType == X_FLOAT) {
d = new float[num];
for (int i = 0; i < num; i++)
*((float*)d + i) = (float)GaussRand(mean, standardDeviation);
}
else if (tensor->dataType == X_DOUBLE) {
d = new double[num];
for (int i = 0; i < num; i++)
*((double*)d + i) = GaussRand(mean, standardDeviation);
}
else {
ShowNTErrors("TODO! Unsupported datatype!")
}
tensor->SetData(d, num);
if (tensor->dataType == X_FLOAT) {
delete[](float*)d;
}
else {
delete[](double*)d;
}
} }
/* /*
......
/* /*
* NiuTrans.Tensor - an open-source tensor library * NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
* You may obtain a copy of the License at * You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today. * I'm surprised that I did not write this file till today.
*/ */
#include <curand.h> #include <curand.h>
#include <time.h> #include <time.h>
...@@ -32,110 +32,35 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,110 +32,35 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
set an integer data array with a fixed value p (in int) set a data array with a fixed value
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedInt(int * d, int size, int p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in int)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void _CudaSetDataFixedInt(XTensor * tensor, int p)
{
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a float data array with a fixed value p (in int)
>> d - pointer to the data array >> d - pointer to the data array
>> v - the initial value
>> size - size of the array >> size - size of the array
>> p - the initial value
*/ */
__global__ template<class T>
void KernelSetDataFixedFloat(float * d, int size, float p) __global__
void KernelSetDataFixed(T * d, T v, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = p; d[i] = v;
}
/*
generate data items with a fixed value p (in float)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void _CudaSetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
} }
template __global__ void KernelSetDataFixed<int>(int *, int, int);
template __global__ void KernelSetDataFixed<float>(float *, float, int);
template __global__ void KernelSetDataFixed<double>(double *, double, int);
/* /*
set a double data array with a fixed value p (in int) generate data items with a fixed value
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedDouble(double * d, int size, double p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in double)
>> tensor - the tensor for initialization >> tensor - the tensor for initialization
>> p - the initial value >> value - the initial value
*/ */
void _CudaSetDataFixedDouble(XTensor * tensor, double p) template<class T>
void _CudaSetDataFixed(XTensor * tensor, T value)
{ {
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -145,59 +70,23 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p) ...@@ -145,59 +70,23 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p)
dim3 threads(blockSize[0]); dim3 threads(blockSize[0]);
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a float data array with a fixed value p (in int) only
if the condition entry is non-zero
>> d - pointer to the data array
>> c - pointer to the condition array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedCondFloat(float * d, float * c, int size, float p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size && c[i] != 0)
d[i] = p;
}
/*
generate data items with a fixed value p (in float) only
if the condition entry is non-zero
>> tensor - the tensor for initialization
>> condition - the condition tensor whose entry would be check to
set the corresponding entry in "tensor"
>> p - the initial value
*/
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedCondFloat <<<blocks, threads >>>((float*)tensor->data, (float*)condition->data, if (tensor->dataType == X_INT)
tensor->unitNum, p); KernelSetDataFixed << <blocks, threads >> > ((int*)tensor->data, (int)value, tensor->unitNum);
else if (tensor->dataType == X_FLOAT)
KernelSetDataFixed << <blocks, threads >> > ((float*)tensor->data, (float)value, tensor->unitNum);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataFixed << <blocks, threads >> > ((double*)tensor->data, (double)value, tensor->unitNum);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
template void _CudaSetDataFixed<int>(XTensor *, int);
template void _CudaSetDataFixed<float>(XTensor *, float);
template void _CudaSetDataFixed<double>(XTensor *, double);
/* /*
set a data array with a fixed value only set a data array with a fixed value only
...@@ -207,28 +96,30 @@ if the condition entry is non-zero ...@@ -207,28 +96,30 @@ if the condition entry is non-zero
>> size - size of the array >> size - size of the array
>> p - the initial value >> p - the initial value
*/ */
template<class T>
__global__ __global__
void KernelSetDataFixedCondInt(int * d, float * c, int size, int p) void KernelSetDataFixedCond(T * d, T * c, T value, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size && c[i] != 0) if (i < size && c[i] != 0)
d[i] = p; d[i] = value;
} }
template __global__ void KernelSetDataFixedCond<int>(int*, int*, int, int);
template __global__ void KernelSetDataFixedCond<float>(float*, float*, float, int);
template __global__ void KernelSetDataFixedCond<double>(double*, double*, double, int);
/* /*
generate data items with a fixed value p (in int) only generate data items with a fixed value p
if the condition entry is non-zero only if the condition entry is non-zero
>> tensor - the tensor for initialization >> tensor - the tensor for initialization
>> condition - the condition tensor whose entry would be checked to >> condition - the condition tensor whose entry would be checked to
set the corresponding entry in "tensor" set the corresponding entry in "tensor"
>> p - the initial value >> value - the initial value
*/ */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p) template<class T>
void _CudaSetDataFixedCond(XTensor* tensor, XTensor* condition, T value)
{ {
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -240,11 +131,24 @@ void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p) ...@@ -240,11 +131,24 @@ void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedCondInt <<<blocks, threads >>>((int*)tensor->data, (float*)condition->data, if (tensor->dataType == X_INT)
tensor->unitNum, p); KernelSetDataFixedCond <<< blocks, threads >>> ((int*)tensor->data, (int*)condition->data,
(int)value, tensor->unitNum);
else if (tensor->dataType == X_FLOAT)
KernelSetDataFixedCond <<< blocks, threads >>> ((float*)tensor->data, (float*)condition->data,
(float)value, tensor->unitNum);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataFixedCond <<< blocks, threads >>> ((double*)tensor->data, (double*)condition->data,
(double)value, tensor->unitNum);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
template void _CudaSetDataFixedCond<int>(XTensor*, XTensor*, int);
template void _CudaSetDataFixedCond<float>(XTensor*, XTensor*, float);
template void _CudaSetDataFixedCond<double>(XTensor*, XTensor*, double);
/* /*
set data array with a uniform distribution in [low, high] set data array with a uniform distribution in [low, high]
...@@ -309,8 +213,9 @@ set data items along with a given dimension (and keep the remaining items unchan ...@@ -309,8 +213,9 @@ set data items along with a given dimension (and keep the remaining items unchan
>> blockSize - size of a data block >> blockSize - size of a data block
>> blockNum - number of data blocks >> blockNum - number of data blocks
*/ */
template<class T>
__global__ __global__
void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p) void KernelSetDataDim(T * d, int beg, int len, int blockSize, int blockNum, T p)
{ {
/* offset in each block */ /* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -326,6 +231,9 @@ void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, ...@@ -326,6 +231,9 @@ void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum,
d[blockSize * j + i] = p; d[blockSize * j + i] = p;
} }
template __global__ void KernelSetDataDim<int>(int*, int, int, int, int, int);
template __global__ void KernelSetDataDim<float>(float*, int, int, int, int, float);
template __global__ void KernelSetDataDim<double>(double*, int, int, int, int, double);
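Each GPU thread of KernelSetDataDim handles one (i, j) pair, where i is an offset inside a block and j is the block index, and writes d[blockSize * j + i]; the host passes beg and len already multiplied by stride. A standalone CPU sketch that enumerates the positions such a launch would write, under assumed parameters (part of the kernel body is elided in this hunk, so the bounds checks here are an assumption):

#include <cstdio>

int main()
{
    /* assumed launch parameters, mirroring the host-side call */
    const int blockSize = 12, blockNum = 2;
    const int beg = 4, len = 8;          /* already multiplied by stride */

    float d[blockSize * blockNum] = {0};

    /* CPU enumeration of the positions each GPU thread (i, j) would write */
    for (int j = 0; j < blockNum; j++)
        for (int i = beg; i < beg + len; i++)
            d[blockSize * j + i] = 1.0F;

    for (int k = 0; k < blockSize * blockNum; k++)
        printf("%g%s", d[k], (k + 1) % blockSize == 0 ? "\n" : " ");
    return 0;
}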
/* /*
set data items along with a given dimension (and keep the remaining items unchanged) - cuda version set data items along with a given dimension (and keep the remaining items unchanged) - cuda version
...@@ -343,7 +251,8 @@ e.g., given a 3 * 3 tensor ...@@ -343,7 +251,8 @@ e.g., given a 3 * 3 tensor
7 8 9 7 8 9
i.e., we set all entries of row 1 to 0 i.e., we set all entries of row 1 to 0
*/ */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p) template<class T>
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p)
{ {
int n = tensor->order; int n = tensor->order;
...@@ -372,11 +281,24 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p) ...@@ -372,11 +281,24 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, if (tensor->dataType == X_INT)
len * stride, blockSize, blockNum, p); KernelSetDataDim << <blocks, threads >> > ((int*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (int)p);
else if (tensor->dataType == X_FLOAT)
KernelSetDataDim << <blocks, threads >> > ((float*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (float)p);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataDim << <blocks, threads >> > ((double*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (double)p);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
template void _CudaSetDataDim<int>(XTensor*, int, int, int, int);
template void _CudaSetDataDim<float>(XTensor*, int, int, int, float);
template void _CudaSetDataDim<double>(XTensor*, int, int, int, double);
/* /*
modify data items along with a given index and dimension modify data items along with a given index and dimension
...@@ -462,6 +384,7 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index) ...@@ -462,6 +384,7 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
/* /*
set lower triangular matrices for each block set lower triangular matrices for each block
>> d - pointer to the data array >> d - pointer to the data array
>> l - row number (or column number) of each block, i.e, >> l - row number (or column number) of each block, i.e,
a block is l * l matrix a block is l * l matrix
...@@ -469,15 +392,15 @@ set lower triangular matrics for each block ...@@ -469,15 +392,15 @@ set lower triangular matrics for each block
>> blockNum - number of the blocks >> blockNum - number of the blocks
>> p - the value for each entry of the lower triangular matrices >> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal >> shift - the offset from diagonal
e.g., for a 3 * 3 tensor, e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have when p = 1 and shift = 0, we have
1 0 0 1 0 0
1 1 0 1 1 0
1 1 1 1 1 1
when p = 2 and shift = -1, we have when p = 2 and shift = -1, we have
0 0 0 0 0 0
2 0 0 2 0 0
2 2 0 2 2 0
*/ */
__global__ __global__
void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift) void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
...@@ -501,35 +424,28 @@ void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, ...@@ -501,35 +424,28 @@ void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p,
*d2 = 0; *d2 = 0;
} }
/* /*
generate data as lower triangular matrices for the last two dimensions (cuda version) generate data as lower triangular matrices for the last two dimensions (cuda version)
>> tensor - the tensor whose data is to be set >> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices >> value - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal >> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when value = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when value = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/ */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift) void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift)
{ {
int n = tensor->order; int size = tensor->GetDim(-1);
int blockSize = size * size;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!"); int blockNum = tensor->unitNum / blockSize;
CheckNTErrors(n >= 2, "The tensor must have a order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
...@@ -542,7 +458,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift) ...@@ -542,7 +458,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift); KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, size, blockSize, blockNum, value, shift);
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
......
...@@ -28,31 +28,24 @@ ...@@ -28,31 +28,24 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int) */ /* generate data items with a fixed value */
void _CudaSetDataFixedInt(XTensor * tensor, int p); template<class T>
void _CudaSetDataFixed(XTensor * tensor, T value);
/* generate data items with a fixed value p (in float) */ /* generate data items with a fixed value p
void _CudaSetDataFixedFloat(XTensor * tensor, float p); only if the condition entry is non-zero */
template<class T>
/* generate data items with a fixed value p (in double) */ void _CudaSetDataFixedCond(XTensor * tensor, XTensor * condition, T p);
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p (in float) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
/* generate data items with a fixed value p (in int) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */ /* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p); template<class T>
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */ /* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index); void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions (cuda version) */ /* generate data as lower triangular matrices for last two dimensions (cuda version) */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift); void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper); void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
......
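The effect of this consolidation is that one call site now covers all element types: the per-type entry points (_CudaSetDataFixedInt/Float/Double and the conditional variants) disappear, and T is deduced from the argument. A hedged fragment illustrating the intended call pattern; the surrounding helper function and the header name are assumptions for illustration.

#include "SetData.cuh"   /* assumed header name for the declarations above */

void FillWithOnesSketch(nts::XTensor * tensor)
{
    /* before this commit: _CudaSetDataFixedFloat(tensor, 1.0F);
                           or _CudaSetDataFixedInt(tensor, 1);   */
    nts::_CudaSetDataFixed(tensor, 1.0F);   /* T deduced as float */
}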
...@@ -30,32 +30,17 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,32 +30,17 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a xavier initialization */ /* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F); void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
/* generate data items with a fixed value p */ /* generate data items with a fixed value */
void _SetDataFixed(XTensor * tensor, void * valuePointer); template<class T>
void _SetDataFixed(XTensor * tensor, T value);
/* generate data items with a fixed value p (in default type) */ /* generate data items with a fixed value only if the condition entry is non-zero */
void SetDataFixed(XTensor &tensor, DTYPE p); template<class T>
void _SetDataFixedCond(XTensor* tensor, XTensor* condition, T value);
/* generate data items with a fixed value p (in integer) */
void SetDataFixedInt(XTensor &tensor, int p);
/* generate data items with a fixed value p (in int) */
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */ /* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p); template<class T>
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */ /* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index); void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
......
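How a single template serves tensors of different runtime data types is not shown in this header. One plausible shape for such a function, offered purely as a sketch (it is not the library's implementation and assumes host-resident data), is to convert the caller's value to the tensor's dataType before writing it into every cell:

/* sketch only: fill every cell of a CPU-resident tensor with `value`,
   converted to the tensor's runtime data type */
template<class T>
void SetDataFixedSketch(nts::XTensor * tensor, T value)
{
    using namespace nts;

    if (tensor->dataType == X_FLOAT) {
        float v = (float)value;
        for (int i = 0; i < tensor->unitNum; i++)
            ((float*)tensor->data)[i] = v;
    }
    else if (tensor->dataType == X_INT) {
        int v = (int)value;
        for (int i = 0; i < tensor->unitNum; i++)
            ((int*)tensor->data)[i] = v;
    }
    else {
        /* X_DOUBLE and other types would follow the same pattern */
    }
}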
...@@ -70,7 +70,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale) ...@@ -70,7 +70,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)
InitTensor1DV2(&c, x.unitNum, x.dataType, x.devID, x.mem); InitTensor1DV2(&c, x.unitNum, x.dataType, x.devID, x.mem);
_SetDataFixedFloat(&c, 1.0F); c.SetDataFixed(1.0);
_DropoutWithIndex(&x, &maskIndex, &c); _DropoutWithIndex(&x, &maskIndex, &c);
......
...@@ -383,15 +383,7 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -383,15 +383,7 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
int leadDim, int tBeg, int tLen, int yBeg) int leadDim, int tBeg, int tLen, int yBeg)
{ {
if(t == NULL){ if(t == NULL){
if(dedy->dataType == X_FLOAT) dedy->SetDataFixed(1);
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
return; return;
} }
......
...@@ -50,7 +50,7 @@ bool TestDropout1() ...@@ -50,7 +50,7 @@ bool TestDropout1()
XTensor yUser; XTensor yUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(x, 1.0F); x->SetDataFixed(1);
y->SetZeroAll(); y->SetZeroAll();
/* call Dropout function */ /* call Dropout function */
...@@ -88,7 +88,7 @@ bool TestDropout1() ...@@ -88,7 +88,7 @@ bool TestDropout1()
XTensor yUserGPU; XTensor yUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F); xGPU->SetDataFixed(1);
yGPU->SetZeroAll(); yGPU->SetZeroAll();
/* call Dropout function */ /* call Dropout function */
...@@ -157,10 +157,10 @@ bool TestDropout2() ...@@ -157,10 +157,10 @@ bool TestDropout2()
XTensor * dedy = NewTensorV2(order, dimSize); XTensor * dedy = NewTensorV2(order, dimSize);
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(x, 1.0F); x->SetDataFixed(1.0);
y->SetZeroAll(); y->SetZeroAll();
dedx->SetZeroAll(); dedx->SetZeroAll();
_SetDataFixedFloat(dedy, 1.5F); dedy->SetDataFixed(1.5);
/* call Dropout function */ /* call Dropout function */
float dropProb = 0.5F; float dropProb = 0.5F;
...@@ -183,10 +183,10 @@ bool TestDropout2() ...@@ -183,10 +183,10 @@ bool TestDropout2()
XTensor * dedyGPU = NewTensorV2(order, dimSize, X_FLOAT, 1.0F, 0); XTensor * dedyGPU = NewTensorV2(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F); xGPU->SetDataFixed(1.0);
yGPU->SetZeroAll(); yGPU->SetZeroAll();
dedxGPU->SetZeroAll(); dedxGPU->SetZeroAll();
_SetDataFixedFloat(dedyGPU, 1.5F); dedyGPU->SetDataFixed(1.5);
/* call Dropout function */ /* call Dropout function */
_Dropout(xGPU, yGPU, seed, dropProb); _Dropout(xGPU, yGPU, seed, dropProb);
......
...@@ -195,8 +195,8 @@ bool TestReduceSum2() ...@@ -195,8 +195,8 @@ bool TestReduceSum2()
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataFixedFloat(answer, (float)s->GetDim(1)); answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(s, t, 1); _ReduceSum(s, t, 1);
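The expected values follow directly from this setup: every entry of s is 1, so reducing over dimension 1 sums GetDim(1) ones into each output cell. With, say, GetDim(1) = 4, each reduced entry is 1 + 1 + 1 + 1 = 4, which is exactly the constant that answer is filled with.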
...@@ -215,7 +215,7 @@ bool TestReduceSum2() ...@@ -215,7 +215,7 @@ bool TestReduceSum2()
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1); _ReduceSum(sGPU, tGPU, 1);
...@@ -284,8 +284,8 @@ bool TestReduceSum3() ...@@ -284,8 +284,8 @@ bool TestReduceSum3()
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataFixedFloat(answer, (float)s->GetDim(1)); answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(s, t, 1); _ReduceSum(s, t, 1);
...@@ -304,7 +304,7 @@ bool TestReduceSum3() ...@@ -304,7 +304,7 @@ bool TestReduceSum3()
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1); _ReduceSum(sGPU, tGPU, 1);
...@@ -373,8 +373,8 @@ bool TestReduceSum4() ...@@ -373,8 +373,8 @@ bool TestReduceSum4()
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataFixedFloat(answer, (float)s->GetDim(1)); answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(s, t, 1); _ReduceSum(s, t, 1);
...@@ -393,7 +393,7 @@ bool TestReduceSum4() ...@@ -393,7 +393,7 @@ bool TestReduceSum4()
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1); _ReduceSum(sGPU, tGPU, 1);
...@@ -464,8 +464,8 @@ bool TestReduceSum5() ...@@ -464,8 +464,8 @@ bool TestReduceSum5()
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataFixedFloat(answer, (float)s->GetDim(1)); answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(s, t, 1); _ReduceSum(s, t, 1);
...@@ -484,7 +484,7 @@ bool TestReduceSum5() ...@@ -484,7 +484,7 @@ bool TestReduceSum5()
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1); _ReduceSum(sGPU, tGPU, 1);
...@@ -556,8 +556,8 @@ bool TestReduceSum6() ...@@ -556,8 +556,8 @@ bool TestReduceSum6()
XTensor tUser; XTensor tUser;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataFixedFloat(answer, (float)s->GetDim(1)); answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(s, t, 1); _ReduceSum(s, t, 1);
...@@ -576,7 +576,7 @@ bool TestReduceSum6() ...@@ -576,7 +576,7 @@ bool TestReduceSum6()
XTensor tUserGPU; XTensor tUserGPU;
/* initialize variables */ /* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
/* call ReduceSum function */ /* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1); _ReduceSum(sGPU, tGPU, 1);
......
...@@ -119,7 +119,7 @@ bool TestSetData2() ...@@ -119,7 +119,7 @@ bool TestSetData2()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize); XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
modify->SetData(data, dataUnitNum); modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */ /* call SetDataIndexed function */
...@@ -137,7 +137,7 @@ bool TestSetData2() ...@@ -137,7 +137,7 @@ bool TestSetData2()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0); XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
modifyGPU->SetData(data, dataUnitNum); modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */ /* call SetDataIndexed function */
...@@ -212,11 +212,11 @@ bool TestSetData3() ...@@ -212,11 +212,11 @@ bool TestSetData3()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize); XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
modify->SetData(data, dataUnitNum); modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */ /* call SetDataIndexed function */
_SetDataFixedFloat(s, 1.0F); s->SetDataFixed(1);
_SetDataIndexed(s, modify, 1, 1); _SetDataIndexed(s, modify, 1, 1);
/* check results */ /* check results */
...@@ -231,7 +231,7 @@ bool TestSetData3() ...@@ -231,7 +231,7 @@ bool TestSetData3()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0); XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F); sGPU->SetDataFixed(1);
modifyGPU->SetData(data, dataUnitNum); modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */ /* call SetDataIndexed function */
......
...@@ -91,7 +91,7 @@ bool TestSpread1() ...@@ -91,7 +91,7 @@ bool TestSpread1()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize); XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(s, 0.0F); s->SetZeroAll();
modify->SetData(data, dataUnitNum); modify->SetData(data, dataUnitNum);
/* call _Spread function */ /* call _Spread function */
...@@ -109,7 +109,7 @@ bool TestSpread1() ...@@ -109,7 +109,7 @@ bool TestSpread1()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0); XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */ /* Initialize variables */
_SetDataFixedFloat(sGPU, 0.0F); sGPU->SetZeroAll();
modifyGPU->SetData(data, dataUnitNum); modifyGPU->SetData(data, dataUnitNum);
/* call _Spread function */ /* call _Spread function */
......
...@@ -296,8 +296,8 @@ bool TestSumDim3() ...@@ -296,8 +296,8 @@ bool TestSumDim3()
/* initialize variables */ /* initialize variables */
a->SetZeroAll(); a->SetZeroAll();
cMe->SetZeroAll(); cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F); b->SetDataFixed(1);
_SetDataFixedFloat(answer, 1.0F); answer->SetDataFixed(1);
/* call SumDim function */ /* call SumDim function */
_SumDim(a, b, c, 1); _SumDim(a, b, c, 1);
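The check here is equally direct: a starts as all zeros and b is all ones, so adding b along dimension 1 gives 0 + 1 = 1 in every cell, matching the all-ones answer tensor.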
...@@ -323,7 +323,7 @@ bool TestSumDim3() ...@@ -323,7 +323,7 @@ bool TestSumDim3()
/* Initialize variables */ /* Initialize variables */
aGPU->SetZeroAll(); aGPU->SetZeroAll();
cMe->SetZeroAll(); cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F); bGPU->SetDataFixed(1);
/* call sum function */ /* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1); _SumDim(aGPU, bGPU, cGPU, 1);
...@@ -405,8 +405,8 @@ bool TestSumDim4() ...@@ -405,8 +405,8 @@ bool TestSumDim4()
/* initialize variables */ /* initialize variables */
a->SetZeroAll(); a->SetZeroAll();
cMe->SetZeroAll(); cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F); b->SetDataFixed(1);
_SetDataFixedFloat(answer, 1.0F); answer->SetDataFixed(1);
/* call SumDim function */ /* call SumDim function */
_SumDim(a, b, c, 1); _SumDim(a, b, c, 1);
...@@ -432,7 +432,7 @@ bool TestSumDim4() ...@@ -432,7 +432,7 @@ bool TestSumDim4()
/* Initialize variables */ /* Initialize variables */
aGPU->SetZeroAll(); aGPU->SetZeroAll();
cMe->SetZeroAll(); cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F); bGPU->SetDataFixed(1);
/* call sum function */ /* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1); _SumDim(aGPU, bGPU, cGPU, 1);
......