Optimize the XBackward implementation to support efficient propagation and gradient accumulation

18a08a65 · xuchen · 0e585782 · 18a08a65 · 18a08a65 · 18a08a65
Commit 18a08a65 authored Feb 18, 2020 by xuchen
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -40,28 +40,37 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * input = income.tails[0];
    XTensor * output = node;
-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);
-    if(operID == FUNC_HARDTANH)
+        XTensor * dedx = input->grad;
-        _HardTanHBackward(output, input, output->grad, input->grad);
+        XTensor * dedy = output->grad;
-    else if(operID == FUNC_IDENTITY)
+        XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
-        _IdentityBackward(output, input, output->grad, input->grad);
-    else if(operID == FUNC_LOGSOFTMAX){
+        if (operID == FUNC_HARDTANH)
-        int leadDim = income.GetParamInt(0);
+            _HardTanHBackward(output, input, dedy, tmp);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
+        else if (operID == FUNC_IDENTITY)
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
+            _IdentityBackward(output, input, dedy, tmp);
-    }
+        else if (operID == FUNC_LOGSOFTMAX) {
-    else if(operID == FUNC_RECTIFY)
+            int leadDim = income.GetParamInt(0);
-        _RectifyBackward(output, input, output->grad, input->grad);
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-    else if(operID == FUNC_SIGMOID)
+            _LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
-        _SigmoidBackward(output, input, output->grad, input->grad);
+        }
-    else if(operID == FUNC_SOFTMAX){
+        else if (operID == FUNC_RECTIFY)
-        int leadDim = income.GetParamInt(0);
+            _RectifyBackward(output, input, dedy, tmp);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
+        else if (operID == FUNC_SIGMOID)
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
+            _SigmoidBackward(output, input, dedy, tmp);
-    }
+        else if (operID == FUNC_SOFTMAX) {
-    else{
+            int leadDim = income.GetParamInt(0);
-        ShowNTErrors("Wrong activation function type!");
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
+            _SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+        }
+        else {
+            ShowNTErrors("Wrong activation function type!");
+        }
+        _SumMe(dedx, tmp);
+        DelTensorBuf(tmp);
    }
    node->visitMark = NODE_FINISHED;

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -48,33 +48,38 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * padding = NULL;
    int leadingDim;
-    XNoder::MakeGrad(output);
+    if (!isEfficient || output->isGrad) {
-    XTensor * dedy = output->grad;
+        XNoder::MakeGrad(output);
+        XTensor * dedy = output->grad;
-    if (income.tailNum == 1) {
-        if(dedy->dataType == X_FLOAT)
+        if (income.tailNum == 1) {
-            _SetDataFixedFloat(dedy, 1.0F);
+            if (dedy->dataType == X_FLOAT)
-        else if(dedy->dataType == X_DOUBLE)
+                _SetDataFixedFloat(dedy, 1.0F);
-            _SetDataFixedDouble(dedy, 1.0);
+            else if (dedy->dataType == X_DOUBLE)
-        else if(dedy->dataType == X_INT)
+                _SetDataFixedDouble(dedy, 1.0);
-            _SetDataFixedInt(dedy, 1);
+            else if (dedy->dataType == X_INT)
-        else
+                _SetDataFixedInt(dedy, 1);
-            ShowNTErrors("TODO");
+            else
+                ShowNTErrors("TODO");
-        return;
-    }
+            return;
+        }
-    gold = income.tails[1];
+        gold = income.tails[1];
-    if(operID == LOSS_CROSSENTROPY) {
-        if (income.tailNum == 3) 
+        XTensor* tmp = NewTensorBufV2(output, output->devID, output->mem);
-            padding = income.tails[2];
+        if (operID == LOSS_CROSSENTROPY) {
-        leadingDim = income.GetParamInt(0);
+            if (income.tailNum == 3)
-        CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
+                padding = income.tails[2];
-        _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
+            leadingDim = income.GetParamInt(0);
-    }
+            CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
-    else{
+            _CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
-        ShowNTErrors("Wrong activation function type!");
+            _SumMe(dedy, tmp);
+        }
+        else {
+            ShowNTErrors("Wrong activation function type!");
+        }
+        DelTensorBuf(tmp);
    }
    node->visitMark = NODE_FINISHED;
@@ -87,79 +92,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
    return (income.typeID & LOSS_BASE) != 0;
 }
-/* 
-compute dE/dx for a given function y = f(x) 
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> x - input of the function
->> dedy - dE/dy
->> dedx - dE/dx
->> funcID - id of the function f
->> params - parameters of the function
->> lossName - name of the loss, e.g., cross entropy
-*/
-//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-//                        XTensor * dedy, XTensor * dedx, XTensor * padding,
-//                        int funcID, void * params,
-//                        LOSS_FUNCTION_NAME lossName)
-//{
-//    CheckNTErrors(gold && y && x, "Empty input tensors!");
-//    CheckNTErrors(dedx, "Empty gradient tensors!");
-//    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
-//
-//    if(funcID == FUNC_HARDTANH){
-//        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_IDENTITY){
-//        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_LOGSOFTMAX){
-//        int leadDim = *(int*)params;
-//        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-//    }
-//    else if(funcID == FUNC_RECTIFY){
-//        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
-//    }
-//    else if(funcID == FUNC_SIGMOID){
-//        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
-//    }else if(funcID == FUNC_SOFTMAX){
-//        int leadDim = *(int*)params;
-//        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-//    }
-//    else{
-//        ShowNTErrors("wrong function found when call the backward process!");
-//    }
-//
-//}
-/* 
-compute dE/dy for variable y and error(loss) function E
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> dedy - dE/dy
->> lossName - name of the loss, e.g., cross entropy
-*/
-//void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-//                        XTensor * dedy, XTensor * padding,
-//                        LOSS_FUNCTION_NAME lossName)
-//{
-//    if(gold == NULL){
-//        if(dedy->dataType == X_FLOAT)
-//            _SetDataFixedFloat(dedy, 1.0F);
-//        else if(dedy->dataType == X_DOUBLE)
-//            _SetDataFixedDouble(dedy, 1.0);
-//        else if(dedy->dataType == X_INT)
-//            _SetDataFixedInt(dedy, 1);
-//        else{
-//            ShowNTErrors("TODO");
-//        }
-//        return;
-//    }
-//
-//    //_LossBackward(dedy, gold, y, lossName);
-//    if(lossName == CROSSENTROPY)
-//        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
-//
-//}
 }
\ No newline at end of file
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -30,80 +30,80 @@ namespace nts{
 /* compute dE/dx of a node */
 void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
-    if(!isEfficient){
+    if (!isEfficient) {
        CheckNTErrors(node->grad != NULL, "No gradient found!");
    }
-    else{
+    else {
        CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
    }
    XLink &income = node->income;
    int operID = income.typeID;
-    if(operID == MATH_ABSOLUTE)
+    if (operID == MATH_ABSOLUTE)
        GradAbsolute(node, isEfficient);
-    else if(operID == MATH_COS)
+    else if (operID == MATH_COS)
        GradCos(node, isEfficient);
-    else if(operID == MATH_EXP)
+    else if (operID == MATH_EXP)
        GradExp(node, isEfficient);
-    else if(operID == MATH_LOG)
+    else if (operID == MATH_LOG)
        GradLog(node, isEfficient);
-    else if(operID == MATH_ROUND)
+    else if (operID == MATH_ROUND)
        GradRound(node, isEfficient);
-    else if(operID == MATH_SIGN)
+    else if (operID == MATH_SIGN)
        GradSign(node, isEfficient);
-    else if(operID == MATH_SIN)
+    else if (operID == MATH_SIN)
        GradSin(node, isEfficient);
-    else if(operID == MATH_TAN)
+    else if (operID == MATH_TAN)
        GradTan(node, isEfficient);
-    else if(operID == MATH_CLIP)
+    else if (operID == MATH_CLIP)
        GradClip(node, isEfficient);
-    else if(operID == MATH_DIV)
+    else if (operID == MATH_DIV)
        GradDiv(node, isEfficient);
-    else if(operID == MATH_DIVDIM)
+    else if (operID == MATH_DIVDIM)
        GradDivDim(node, isEfficient);
-    else if(operID == MATH_MATRIXMUL)
+    else if (operID == MATH_MATRIXMUL)
        GradMatrixMul(node, isEfficient);
-    else if(operID == MATH_MATRIXMULBATCHED)
+    else if (operID == MATH_MATRIXMULBATCHED)
        GradMatrixMulBatched(node, isEfficient);
-    else if(operID == MATH_MULTIPLY)
+    else if (operID == MATH_MULTIPLY)
        GradMultiply(node, isEfficient);
-    else if(operID == MATH_MULTIPLYDIM)
+    else if (operID == MATH_MULTIPLYDIM)
        GradMultiplyDim(node, isEfficient);
    else if (operID == MATH_MULTIPLYBROADCAST)
        GradMultiplyBroadcast(node, isEfficient);
-    else if(operID == MATH_NEGATE)
+    else if (operID == MATH_NEGATE)
        GradNegate(node, isEfficient);
-    else if(operID == MATH_NORMALIZE)
+    else if (operID == MATH_NORMALIZE)
        GradNormalize(node, isEfficient);
-    else if(operID == MATH_POWER)
+    else if (operID == MATH_POWER)
        GradPower(node, isEfficient);
-    else if(operID == MATH_SCALEANDSHIFT)
+    else if (operID == MATH_SCALEANDSHIFT)
        GradScaleAndShift(node, isEfficient);
-    else if(operID == MATH_SCALE)
+    else if (operID == MATH_SCALE)
        GradScale(node, isEfficient);
-    else if(operID == MATH_DESCALE)
+    else if (operID == MATH_DESCALE)
        GradDescale(node, isEfficient);
-    else if(operID == MATH_SHIFT)
+    else if (operID == MATH_SHIFT)
        GradShift(node, isEfficient);
-    else if(operID == MATH_SUB)
+    else if (operID == MATH_SUB)
        GradSub(node, isEfficient);
-    else if(operID == MATH_SUBDIM)
+    else if (operID == MATH_SUBDIM)
        GradSubDim(node, isEfficient);
-    else if(operID == MATH_SUM)
+    else if (operID == MATH_SUM)
        GradSum(node, isEfficient);
-    else if(operID == MATH_SUMDIM)
+    else if (operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
-    else if(operID == MATH_SUMBROADCAST)
+    else if (operID == MATH_SUMBROADCAST)
        GradSumBroadcast(node, isEfficient);
-    else if(operID == REDUCE_REDUCEMEAN)
+    else if (operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
-    else if(operID == REDUCE_REDUCESUM)
+    else if (operID == REDUCE_REDUCESUM)
        GradReduceSum(node, isEfficient);
-    else if(operID == REDUCE_REDUCESUMSQUARED)
+    else if (operID == REDUCE_REDUCESUMSQUARED)
        GradReduceSumSquared(node, isEfficient);
-    else if(operID == REDUCE_REDUCEVARIANCE)
+    else if (operID == REDUCE_REDUCEVARIANCE)
        GradReduceVariance(node, isEfficient);
    else if (operID == MATH_MULANDSHIFT)
        GradMulAndShift(node, isEfficient);
@@ -136,14 +136,17 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * sign(a) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sign(a, b);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _Sign(a, tmp);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -164,15 +167,18 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * -sin(a) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sin(a, b);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _ScaleAndShiftMe(b, -1.0F);
+        _Sin(a, tmp);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _NegateMe(tmp);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -193,14 +199,17 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * exp(a) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Exp(a, b);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _Exp(a, tmp);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -222,9 +231,11 @@ void XMathGrad::GradLog(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * 1/a */
+    if (!isEfficient || a->isGrad) {
-    _Div(node->grad, a, a->grad, 1.0F);
+        XNoder::MakeGrad(a);
+        _Div(node->grad, a, a->grad, 1.0F);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -244,8 +255,12 @@ void XMathGrad::GradRound(XTensor * node, bool isEfficient)
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
-    // we do nothing here
+    XTensor * a = income.tails[0];
-    // TODO: set grad = 0 if the node is the only child
+    /* dE/da = 0, we do nothing here */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -265,8 +280,12 @@ void XMathGrad::GradSign(XTensor * node, bool isEfficient)
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
-    // we do nothing here
+    XTensor * a = income.tails[0];
-    // TODO: set grad = 0 if the node is the only child
+    /* dE/da = 0, we do nothing here */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -287,14 +306,17 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * cos(a) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Cos(a, b);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _Cos(a, tmp);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -315,15 +337,18 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
+    XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * 1/(cos(a))^2
+             = dE/dc * (cos(a))^-2 */
-    _Cos(a, b);
+    if (!isEfficient || a->isGrad) {
-    _PowerMe(b, -2.0F);
+        XNoder::MakeGrad(a);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _Cos(a, tmp);
+        _PowerMe(tmp, -2.0F);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -343,17 +368,21 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
    DTYPE lower = income.GetParam(0);
    DTYPE upper = income.GetParam(1);
-    XNoder::MakeGrad(a);
+    /* dE/da = 1  lower < a < upper
+             = 0  otherwise */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _ClipBackward(node, a, node->grad, a->grad, lower, upper);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _Sum(a->grad, b, a->grad);
+        _ClipBackward(node, a, node->grad, tmp, lower, upper);
+        _SumMe(a->grad, tmp);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -376,21 +405,26 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
-    XTensor * ab2 = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
    CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
+    /* dE/da = dE/dc / b */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+        _Div(node->grad, b, a->grad, 1.0F);
+    }
-    _Div(node->grad, b, a->grad, 1.0F);
+    /* dE/db = dE/dc * a/(-b^2)
+             = dE/dc * a * (-b^-2) */
-    _Power(b, ab2, -2.0F);
+    if (!isEfficient || b->isGrad) {
-    _Multiply(a, ab2, ab2);
+        XNoder::MakeGrad(b);
-    _ScaleAndShiftMe(ab2, -1.0F);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _Multiply(node->grad, ab2, b->grad, 1.0F);
+        _Power(b, tmp, -2.0F);
+        _NegateMe(tmp);
+        _MultiplyMe(tmp, a);
+        _Multiply(node->grad, tmp, b->grad, 1.0F);
-    DelTensorBuf(ab2);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -414,87 +448,82 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    int n = income.GetParamInt(0);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
    /* dE/da = dE/dc * (1/b) */
-    _DivDim(node->grad, b, a->grad, n, 1.0);
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+        _DivDim(node->grad, b, a->grad, n, 1.0);
+    }
-    /* dE/db = dE/dc * dc/db */
+    /* dE/db = dE/dc * dc/db
-    int order = a->order;
+             = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...) */
-    int dimSize[MAX_TENSOR_DIM_NUM];
+    if (!isEfficient || b->isGrad) {
-    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
+        XNoder::MakeGrad(b);
+        int order = a->order;
+        int dimSize[MAX_TENSOR_DIM_NUM];
+        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-    XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
+        XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
-    XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
+        XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
-    XTensor * bTMP  = NewTensorBufV2(b, b->devID, b->mem);
+        XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
-    XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
+        XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
-    _Negate(a, aTMP1);
+        _Negate(a, aTMP1);
-    _Power(b, bTMP, -2.0F);
+        _Power(b, bTMP, -2.0F);
-    _MultiplyDim(aTMP1, bTMP, aTMP2, n);
+        _MultiplyDim(aTMP1, bTMP, aTMP2, n);
-    _Multiply(node->grad, aTMP2, interGradTMP);
+        _Multiply(node->grad, aTMP2, interGradTMP);
-    if(n == order - 1){
+        if (n == order - 1) {
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = a->unitNum/dimSize[order - 1];
+            reshapedSize[0] = a->unitNum / dimSize[order - 1];
-        reshapedSize[1] = dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
-        /* we reshape dE/dc * a to a matrix whose column number is equal to the 
+            /* we reshape dE/dc * a to a matrix whose column number is equal to the
-           size of b. Then we can reduce the matrix into a row vector. */
+               size of b. Then we can reduce the matrix into a row vector. */
-        interGradTMP->Reshape(2, reshapedSize);
+            interGradTMP->Reshape(2, reshapedSize);
-        //if(b->outgo.tailNum > 1){
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(interGradTMP, bGradTMP, 0);
-            _Sum(b->grad, bGradTMP, b->grad);
+            _SumMe(b->grad, bGradTMP);
            DelTensorBuf(bGradTMP);
-        /*}
-        else{
-            _ReduceSum(interGradTMP, b->grad, 0);
-        }*/
-    }
-    else{
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = 1;
-        reshapedSize[1] = dimSize[n];
-        reshapedSize[2] = 1;
-        for(int i = 0; i < order; i++){
-            if(i < n)
-                reshapedSize[0] *= dimSize[i];
        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
-        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
-        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. 
+            reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
-           Then reduce along with z and x to obtain dE/db. */
-        interGradTMP->Reshape(3, reshapedSize);
-        XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
-        _ReduceSum(interGradTMP, interGrad, 2);
+               Then reduce along with z and x to obtain dE/db. */
+            interGradTMP->Reshape(3, reshapedSize);
-        //if(b->outgo.tailNum > 1){
+            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
-            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
+            _ReduceSum(interGradTMP, interGrad, 2);
+            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(interGrad, bGradTMP2, 0);
-            _Sum(b->grad, bGradTMP2, b->grad);
+            _SumMe(b->grad, bGradTMP2);
            DelTensorBuf(bGradTMP2);
-        /*}
+            DelTensorBuf(interGrad);
-        else{
+        }
-            _ReduceSum(interGrad, b->grad, 0);
-        }*/
-        DelTensorBuf(interGrad);
-    }
-    DelTensorBuf(interGradTMP);
+        DelTensorBuf(interGradTMP);
-    DelTensorBuf(bTMP);
+        DelTensorBuf(bTMP);
-    DelTensorBuf(aTMP2);
+        DelTensorBuf(aTMP2);
-    DelTensorBuf(aTMP1);
+        DelTensorBuf(aTMP1);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -521,9 +550,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
    MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
    DTYPE alpha = income.GetParam(2);
-    if(!isEfficient || a->isGrad)
+    if (!isEfficient || a->isGrad)
        XNoder::MakeGrad(a);
-    if(!isEfficient || b->isGrad)
+    if (!isEfficient || b->isGrad)
        XNoder::MakeGrad(b);
    XTensor * c = node;
@@ -531,9 +560,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
    XTensor * deda = a->grad;
    XTensor * dedb = b->grad;
-    if(a->order == 2 && b->order == 2)
+    if (a->order == 2 && b->order == 2)
        GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
-    else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
+    else if (transA == X_NOTRANS && a->order > 2 && b->order == 2){
        int orderBackupA = a->order;
        int orderBackupC = c->order;
        int dimsBackupA[MAX_TENSOR_DIM_NUM];
@@ -543,7 +572,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
        a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
        c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
        dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
@@ -551,7 +580,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
        a->Reshape(orderBackupA, dimsBackupA);
        c->Reshape(orderBackupC, dimsBackupC);
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            deda->Reshape(orderBackupA, dimsBackupA);
        dedc->Reshape(orderBackupC, dimsBackupC);
    }
@@ -578,54 +607,54 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
                              XTensor * dedc, DTYPE alpha, bool isEfficient)
 {
    /* c = a * b * \alpha */
-    if(transA == X_NOTRANS && transB == X_NOTRANS){
+    if (transA == X_NOTRANS && transB == X_NOTRANS) {
        /* dE/da = dE/dc * b^T * \alpha */
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a^T * dE/dc * \alpha */
-        if(!isEfficient || b->isGrad)
+        if (!isEfficient || b->isGrad)
            _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b * \alpha */
-    else if(transA == X_TRANS && transB == X_NOTRANS){
+    else if (transA == X_TRANS && transB == X_NOTRANS){
        /* dE/da = (dE/dc * b^T)^T * \alpha 
                 = b * dE/dc^T * \alpha */
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a * dE/dc * \alpha */
-        if(!isEfficient || b->isGrad)
+        if (!isEfficient || b->isGrad)
            _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a * b^T * \alpha */
-    else if(transA == X_NOTRANS && transB == X_TRANS){
+    else if (transA == X_NOTRANS && transB == X_TRANS){
        /* dE/da = dE/dc * b * \alpha */
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
        /* dE/db = (a^T * dE/dc)^T * \alpha 
                 = dE/dc^T * a * \alpha */
-        if(!isEfficient || b->isGrad)
+        if (!isEfficient || b->isGrad)
            _MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b^T * \alpha */
-    else if(transA == X_TRANS && transB == X_TRANS){
+    else if (transA == X_TRANS && transB == X_TRANS){
        /* dE/da = (dE/dc * b)^T * \alpha 
                 = b^T * dE/dc^T * \alpha */
-        if(!isEfficient || a->isGrad)
+        if (!isEfficient || a->isGrad)
            _MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = (a * dE/dc)^T * \alpha 
                 = dE/dc^T * a^T * \alpha */
-        if(!isEfficient || b->isGrad)
+        if (!isEfficient || b->isGrad)
            _MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
    }
 }
@@ -653,55 +682,65 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
    MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
    DTYPE alpha = income.GetParam(2);
-    XNoder::MakeGrad(a);
+    if (!isEfficient || a->isGrad)
-    XNoder::MakeGrad(b);
+        XNoder::MakeGrad(a);
+    if (!isEfficient || b->isGrad)
+        XNoder::MakeGrad(b);
    XTensor * dedc = node->grad;
    XTensor * deda = a->grad;
    XTensor * dedb = b->grad;
    /* c = a * b * \alpha */
-    if(transA == X_NOTRANS && transB == X_NOTRANS){
+    if (transA == X_NOTRANS && transB == X_NOTRANS) {
        /* dE/da = dE/dc * b^T * \alpha */
-        _MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
+        if (!isEfficient || a->isGrad)
+            _MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a^T * dE/dc * \alpha */
-        _MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+        if (!isEfficient || b->isGrad)
+            _MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b * \alpha */
-    else if(transA == X_TRANS && transB == X_NOTRANS){
+    else if (transA == X_TRANS && transB == X_NOTRANS) {
        /* dE/da = (dE/dc * b^T)^T * \alpha 
                 = b * dE/dc^T * \alpha */
-        _MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        if (!isEfficient || a->isGrad)
+            _MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a * dE/dc * \alpha */
-        _MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+        if (!isEfficient || b->isGrad)
+            _MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a * b^T * \alpha */
-    else if(transA == X_NOTRANS && transB == X_TRANS){
+    else if (transA == X_NOTRANS && transB == X_TRANS) {
        /* dE/da = dE/dc * b * \alpha */
-        _MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
+        if (!isEfficient || a->isGrad)
+            _MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
        /* dE/db = (a^T * dE/dc)^T * \alpha 
                 = dE/dc^T * a * \alpha */
-        _MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
+        if (!isEfficient || b->isGrad)
+            _MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b^T * \alpha */
-    else if(transA == X_TRANS && transB == X_TRANS){
+    else if (transA == X_TRANS && transB == X_TRANS) {
        /* dE/da = (dE/dc * b)^T * \alpha 
                 = b^T * dE/dc^T * \alpha */
-        _MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        if (!isEfficient || a->isGrad)
+            _MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = (a * dE/dc)^T * \alpha 
                 = dE/dc^T * a^T * \alpha */
-        _MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
+        if (!isEfficient || b->isGrad)
+            _MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
    }
    node->visitMark = NODE_FINISHED;
@@ -728,11 +767,13 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
    CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
+    /* dE/da = dE/dc * b */
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);
        _Multiply(node->grad, b, a->grad, 1.0F);
    }
+    /* dE/db = dE/dc * a */
    if (!isEfficient || b->isGrad) {
        XNoder::MakeGrad(b);
        _Multiply(node->grad, a, b->grad, 1.0F);
@@ -760,77 +801,70 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    int n = income.GetParamInt(0);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-    /* dE/da */
+    /* dE/da = dE/dc * b */
-    _MultiplyDim(node->grad, b, a->grad, n, 1.0F);
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    /* dE/db */
+        _MultiplyDim(node->grad, b, a->grad, n, 1.0F);
-    int order = a->order;
+    }
-    int dimSize[MAX_TENSOR_DIM_NUM];
-    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-    XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
+    /* dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...) */
-    _Multiply(node->grad, a, bGradTMP);
+    if (!isEfficient || b->isGrad) {
+        XNoder::MakeGrad(b);
-    if(n == order - 1){
+        int order = a->order;
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        int dimSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = a->unitNum/dimSize[order - 1];
+        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-        reshapedSize[1] = dimSize[order - 1];
-        /* we reshape dE/dc * a to a matrix whose column number is equal to the 
+        XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
-           size of b. Then we can reduce the matrix into a row vector. */
+        _Multiply(node->grad, a, bGradTMP);
-        bGradTMP->Reshape(2, reshapedSize);
-        //if(b->outgo.tailNum > 1){
+        if (n == order - 1) {
-            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = a->unitNum / dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
+            /* we reshape dE/dc * a to a matrix whose column number is equal to the
+               size of b. Then we can reduce the matrix into a row vector. */
+            bGradTMP->Reshape(2, reshapedSize);
+            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(bGradTMP, bGradTMP2, 0);
            _Sum(b->grad, bGradTMP2, b->grad);
            DelTensorBuf(bGradTMP2);
-        /*}
-        else{
-            _ReduceSum(bGradTMP, b->grad, 0);
-        }*/
-    }
-    else{
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = 1;
-        reshapedSize[1] = dimSize[n];
-        reshapedSize[2] = 1;
-        for(int i = 0; i < order; i++){
-            if(i < n)
-                reshapedSize[0] *= dimSize[i];
        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
-        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
-        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. 
+            reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
-           Then reduce along with z and x to obtain dE/db. */
-        bGradTMP->Reshape(3, reshapedSize);
-        XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
-        _ReduceSum(bGradTMP, interGrad, 2);
+               Then reduce along with z and x to obtain dE/db. */
+            bGradTMP->Reshape(3, reshapedSize);
-        //if(b->outgo.tailNum > 1){
+            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
-            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
+            _ReduceSum(bGradTMP, interGrad, 2);
+            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(interGrad, bGradTMP2, 0);
            _Sum(b->grad, bGradTMP2, b->grad);
            DelTensorBuf(bGradTMP2);
-        /*}
+            DelTensorBuf(interGrad);
-        else{
+        }
-            _ReduceSum(interGrad, b->grad, 0);
+        DelTensorBuf(bGradTMP);
-        }*/
-        DelTensorBuf(interGrad);
    }
-    DelTensorBuf(bGradTMP);
    node->visitMark = NODE_FINISHED;
 }
@@ -857,11 +891,18 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
-    XNoder::MakeGrad(a);
-    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
-    if(b->isVar || b->income.tailNum > 0){
+    /* dE/da = dE/dc * b */
-        ShowNTErrors("TODO");
+    if (!isEfficient || a->isGrad) {
+        /* allocate dE/da only when it is needed, as the other
+           gradient routines in this file do */
+        XNoder::MakeGrad(a);
+        _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
+    }
+    /* dE/db = (dE/dc * a).reduce(0...n) */
+    if (!isEfficient || b->isGrad) {
+        if (b->isVar || b->income.tailNum > 0)
+            ShowNTErrors("TODO");
    }
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -880,14 +921,12 @@ void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
-    XNoder::MakeGrad(a);
-    _ScaleAndShift(node->grad, b, -1.0F);
-    _Sum(a->grad, b, a->grad);
-    DelTensorBuf(b);
+    /* dE/da = dE/dc * (-1) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);        
+        _Sum(a->grad, node->grad, a->grad, -1.0F);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -901,7 +940,6 @@ gradient for normalize
 void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
 {
    ShowNTErrors("TODO!");
 }
 /*
@@ -920,17 +958,20 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
    DTYPE p = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = (dE/dc) * p * a^(p-1) */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Power(a, b, p - 1.0F);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _ScaleAndShiftMe(b, p);
+        _Power(a, tmp, p - 1.0F);
-    _Multiply(node->grad, b, a->grad, 1.0F);
+        _ScaleAndShiftMe(tmp, p);
+        _Multiply(node->grad, tmp, a->grad, 1.0F);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -954,9 +995,12 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
    DTYPE scale = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * scale */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sum(a->grad, node->grad, a->grad, scale);
+        _Sum(a->grad, node->grad, a->grad, scale);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -980,9 +1024,12 @@ void XMathGrad::GradScale(XTensor * node, bool isEfficient)
    DTYPE scale = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc * scale */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sum(a->grad, node->grad, a->grad, scale);
+        _Sum(a->grad, node->grad, a->grad, scale);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1006,9 +1053,12 @@ void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
    DTYPE descale = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc / descale */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sum(a->grad, node->grad, a->grad, 1/descale);
+        _Sum(a->grad, node->grad, a->grad, 1 / descale);
+    } 
    node->visitMark = NODE_FINISHED;
 }
@@ -1030,9 +1080,12 @@ void XMathGrad::GradShift(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Sum(a->grad, node->grad, a->grad);
+        _Sum(a->grad, node->grad, a->grad);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1057,11 +1110,17 @@ void XMathGrad::GradSub(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc */
-    XNoder::MakeGrad(b);
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+        _Sum(a->grad, node->grad, a->grad);
+    }
-    _Sum(a->grad, node->grad, a->grad);
+    /* dE/db = -dE/dc * \beta */
-    _Sum(b->grad, node->grad, b->grad, -beta);
+    if (!isEfficient || b->isGrad) {
+        XNoder::MakeGrad(b);
+        _Sum(b->grad, node->grad, b->grad, -beta);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1085,81 +1144,70 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
    int n = income.GetParamInt(0);
    DTYPE beta = income.GetParam(1);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-    _Sum(a->grad, node->grad, a->grad);
+    /* dE/da = dE/dc */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+        _Sum(a->grad, node->grad, a->grad);
+    }
-    int order = a->order;
+    /* dE/db = - dE/dc.reduce(0,...,n-1,n+1,...) * \beta */
-    int dimSize[MAX_TENSOR_DIM_NUM];
+    if (!isEfficient || b->isGrad) {
-    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
+        XNoder::MakeGrad(b);
+        int order = a->order;
+        int dimSize[MAX_TENSOR_DIM_NUM];
+        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-    if(n == order - 1){
+        if (n == order - 1) {
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = a->unitNum / dimSize[order - 1];
+            reshapedSize[0] = a->unitNum / dimSize[order - 1];
-        reshapedSize[1] = dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
-        /* we reshape dE/dc to a matrix whose column number is equal to the
+            /* we reshape dE/dc to a matrix whose column number is equal to the
-           size of b. Then we can reduce the matrix into a row vector. */
+               size of b. Then we can reduce the matrix into a row vector. */
-        node->grad->Reshape(2, reshapedSize);
+            node->grad->Reshape(2, reshapedSize);
-        //if(b->outgo.tailNum > 1){
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(node->grad, bGradTMP, 0);
-            if(beta != 1.0F)
+            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sub(b->grad, bGradTMP, b->grad);
            DelTensorBuf(bGradTMP);
-        /*}
-        else{
+            node->grad->Reshape(order, dimSize);
-            _ReduceSum(node->grad, b->grad, 0);
-            if(beta != 1.0F)
-                _ScaleAndShiftMe(b->grad, beta);
-            _ScaleAndShiftMe(b->grad, -1.0F);
-        }*/
-        node->grad->Reshape(order, dimSize);
-    }
-    else{
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = 1;
-        reshapedSize[1] = dimSize[n];
-        reshapedSize[2] = 1;
-        for(int i = 0; i < order; i++){
-            if(i < n)
-                reshapedSize[0] *= dimSize[i];
        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
-        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
-        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+            reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
-           Then reduce along with z and x to obtain dE/db. */
-        node->grad->Reshape(3, reshapedSize);
-        XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+               Then reduce along with z and x to obtain dE/db. */
+            node->grad->Reshape(3, reshapedSize);
-        _ReduceSum(node->grad, interGrad, 2);
+            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            _ReduceSum(node->grad, interGrad, 2);
-        //if(b->outgo.tailNum > 1){
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(interGrad, bGradTMP, 0);
-            if(beta != 1.0F)
+            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sub(b->grad, bGradTMP, b->grad);
            DelTensorBuf(bGradTMP);
-        /*}
-        else{
-            _ReduceSum(interGrad, b->grad, 0);
-            if(beta != 1.0F)
-                _ScaleAndShiftMe(b->grad, beta);
-            _ScaleAndShiftMe(b->grad, -1.0F);
-        }*/
-        node->grad->Reshape(order, dimSize);
-        DelTensorBuf(interGrad);
+            node->grad->Reshape(order, dimSize);
+            DelTensorBuf(interGrad);
+        }
    }
    node->visitMark = NODE_FINISHED;
@@ -1172,7 +1220,6 @@ c =  a + b * \beta
 we have
 dE/da = dE/dc 
 dE/db = dE/dc * \beta
 >> node - the node (c) for backward computation
 >> isEfficient - indicates whether the computation is in
                 an efficient manner
@@ -1186,12 +1233,14 @@ void XMathGrad::GradSum(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);
-    if(!isEfficient || a->isGrad){
+    /* dE/da = dE/dc */
+    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);
        _Sum(a->grad, node->grad, a->grad);
    }
-    if(!isEfficient || b->isGrad){
+    /* dE/db = dE/dc * \beta */
+    if (!isEfficient || b->isGrad) {
        XNoder::MakeGrad(b);
        _Sum(b->grad, node->grad, b->grad, beta);
    }
@@ -1219,81 +1268,72 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
    int n = income.GetParamInt(0);
    DTYPE beta = income.GetParam(1);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-    _Sum(a->grad, node->grad, a->grad);
+    if (!isEfficient || a->isGrad) {
+        /* dE/da = dE/dc */
+        XNoder::MakeGrad(a);
+        _Sum(a->grad, node->grad, a->grad);
+    }
-    int order = a->order;
+    /* dE/db = dE/dc.reduce(0,...,n-1,n+1,...) * \beta */
-    int dimSize[MAX_TENSOR_DIM_NUM];
+    if (!isEfficient || b->isGrad) {
-    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
+        XNoder::MakeGrad(b);
+        int order = a->order;
+        int dimSize[MAX_TENSOR_DIM_NUM];
+        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
-    if(n == order - 1){
+        if (n == order - 1) {
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = a->unitNum/dimSize[order - 1];
+            reshapedSize[0] = a->unitNum / dimSize[order - 1];
-        reshapedSize[1] = dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
-        /* we reshape dE/dc to a matrix whose column number is equal to the 
+            /* we reshape dE/dc to a matrix whose column number is equal to the
-           size of b. Then we can reduce the matrix into a row vector. */
+               size of b. Then we can reduce the matrix into a row vector. */
-        node->grad->Reshape(2, reshapedSize);
+            node->grad->Reshape(2, reshapedSize);
-        //if(b->outgo.tailNum > 1){
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(node->grad, bGradTMP, 0);
-            if(beta != 1.0F)
+            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sum(bGradTMP, b->grad, b->grad);
            DelTensorBuf(bGradTMP);
-        /*}
-        else{
+            node->grad->Reshape(order, dimSize);
-            _ReduceSum(node->grad, b->grad, 0);
-            if(beta != 1.0F)
-                _ScaleAndShiftMe(b->grad, beta);
-        }*/
-        node->grad->Reshape(order, dimSize);
-    }
-    else{
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = 1;
-        reshapedSize[1] = dimSize[n];
-        reshapedSize[2] = 1;
-        for(int i = 0; i < order; i++){
-            if(i < n)
-                reshapedSize[0] *= dimSize[i];
        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
-        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
-        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. 
+            reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
-           Then reduce along with z and x to obtain dE/db. */
-        node->grad->Reshape(3, reshapedSize);
-        XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+               Then reduce along with z and x to obtain dE/db. */
+            node->grad->Reshape(3, reshapedSize);
-        _ReduceSum(node->grad, interGrad, 2);
+            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            _ReduceSum(node->grad, interGrad, 2);
-        //if(b->outgo.tailNum > 1){
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(interGrad, bGradTMP, 0);
-            if(beta != 1.0F)
+            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sum(bGradTMP, b->grad, b->grad);
            DelTensorBuf(bGradTMP);
-        /*}
-        else{
-            _ReduceSum(interGrad, b->grad, 0);
-            if(beta != 1.0F)
-                _ScaleAndShiftMe(b->grad, beta);
-        }*/
-        node->grad->Reshape(order, dimSize);
-        DelTensorBuf(interGrad);
+            node->grad->Reshape(order, dimSize);
+            DelTensorBuf(interGrad);
+        }
    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1320,12 +1360,20 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
    XTensor * b = income.tails[1];
    //DTYPE beta = income.GetParam(0);
-    XNoder::MakeGrad(a);
+    /* dE/da = dE/dc */
-    _Sum(a->grad, node->grad, a->grad);
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
+        _Sum(a->grad, node->grad, a->grad);
+    }
-    if(b->isVar || b->income.tailNum > 0){
+    /* dE/db = dE/dc.reduce(0..n) * \beta */
-        ShowNTErrors("TODO");
+    if (!isEfficient || b->isGrad) {
+        if (b->isVar || b->income.tailNum > 0) {
+            ShowNTErrors("TODO");
+        }
    }
+    node->visitMark = NODE_FINISHED;
 }
 /*
@@ -1345,18 +1393,21 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
    int dim = income.GetParamInt(0);
    int n = a->GetDim(dim);
-    XNoder::MakeGrad(a);
+    /* dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim] */
+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
-    _Unsqueeze(node->grad, b, dim, n);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
-    _ScaleAndShiftMe(b, 1.0F/n);
+        _Unsqueeze(node->grad, tmp, dim, n);
-    _Sum(a->grad, b, a->grad);
+        _ScaleAndShiftMe(tmp, 1.0F / n);
+        _Sum(a->grad, tmp, a->grad);
-    DelTensorBuf(b);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1366,7 +1417,7 @@ gradient for reduceSum
 for
 c = reduceSum(a, dim)
 we have
-dE/da = Unsqueeze(dE/dc) * 1
+dE/da = Unsqueeze(dE/dc)
 >> node - the node (c) for backward computation
 >> isEfficient - indicates whether the computation is in
@@ -1378,17 +1429,19 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
    XTensor * a = income.tails[0];
-    XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
    int dim = income.GetParamInt(0);
    int n = a->GetDim(dim);
-    XNoder::MakeGrad(a);
+    /* dE/da = Unsqueeze(dE/dc) */
+    if (!isEfficient || a->isGrad) {
-    _Unsqueeze(node->grad, b, dim, n);
+        XNoder::MakeGrad(a);
-    _Sum(a->grad, b, a->grad);
-    DelTensorBuf(b);
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
+        _Unsqueeze(node->grad, tmp, dim, n);
+        _Sum(a->grad, tmp, a->grad);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -1419,22 +1472,28 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
    int dim = income.GetParamInt(0);
    int n = a->GetDim(dim);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-    /* compute a-b */
    _Unsqueeze(b, c, dim, n);
    _Sub(a, c, d);
-    _ReduceSum(d, f, dim);
    /* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) */
-    _ScaleAndShiftMe(d, 2.0F);
+    if (!isEfficient || a->isGrad) {
-    _Unsqueeze(node->grad, e, dim, n);
+        XNoder::MakeGrad(a);
-    _Multiply(d, e, a->grad, 1.0F);
+        /* put the factor 2 on the scratch tensor e and leave d = a - b
+           untouched, because d is reduced again for dE/db below */
+        _Unsqueeze(node->grad, e, dim, n);
+        _ScaleAndShiftMe(e, 2.0F);
+        _Multiply(d, e, a->grad, 1.0F);
+    }
-    /* dE/db = dE/dc * -2 * n * \sum_i (a_i - b) */
+    /* dE/db = dE/dc * -2 * \sum_i (a_i - b) */
-    _ScaleAndShiftMe(f, -2.0F);
+    if (!isEfficient || b->isGrad) {
-    _Multiply(node->grad, f, b->grad, 1.0F);
+        XNoder::MakeGrad(b);
+        _ReduceSum(d, f, dim);
+        _ScaleAndShiftMe(f, -2.0F);
+        _Multiply(node->grad, f, b->grad, 1.0F);
+    }
    DelTensorBuf(f);
    DelTensorBuf(e);
@@ -1471,22 +1530,27 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
    int dim = income.GetParamInt(0);
    int n = a->GetDim(dim);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-    /* compute a-b */
    _Unsqueeze(b, c, dim, n);
    _Sub(a, c, d);
-    _ReduceSum(d, f, dim);
    /* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) / n */
-    _ScaleAndShiftMe(d, 2.0F / n);
+    if (!isEfficient || a->isGrad) {
-    _Unsqueeze(node->grad, e, dim, n);
+        XNoder::MakeGrad(a);
-    _Multiply(d, e, a->grad, 1.0F);
+        /* put the factor 2/n on the scratch tensor e and leave d = a - b
+           untouched, because d is reduced again for dE/db below */
+        _Unsqueeze(node->grad, e, dim, n);
+        _ScaleAndShiftMe(e, 2.0F / n);
+        _Multiply(d, e, a->grad, 1.0F);
+    }
-    /* dE/db = dE/dc * -2 * \sum_i (a_i - b) */
+    /* dE/db = dE/dc * -2 * \sum_i (a_i - b) / n */
-    _ScaleAndShiftMe(f, -2.0F /n);
+    if (!isEfficient || b->isGrad) {
-    _Multiply(node->grad, f, b->grad, 1.0F);
+        XNoder::MakeGrad(b);
+        _ReduceSum(d, f, dim);
+        _ScaleAndShiftMe(f, -2.0F / n);
+        _Multiply(node->grad, f, b->grad, 1.0F);
+    }
    DelTensorBuf(f);
    DelTensorBuf(e);
@@ -1496,7 +1560,6 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
    node->visitMark = NODE_FINISHED;
 }
 /*
 gradient for operation
 for c = matmul(x, w) + b 
@@ -1521,66 +1584,66 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
    MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
    MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
-    if (!isEfficient || w->isGrad)
+    /* dE/db = dE/dc.reduce(0,...,n-1,n+1,...) */
-        XNoder::MakeGrad(w);
+    if (!isEfficient || b->isGrad) {
-    if (!isEfficient || x->isGrad)
-        XNoder::MakeGrad(x);
-    if (!isEfficient || b->isGrad)
        XNoder::MakeGrad(b);
-    int order = node->order;
+        int order = node->order;
-    int dimSize[MAX_TENSOR_DIM_NUM];
+        int dimSize[MAX_TENSOR_DIM_NUM];
-    memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
+        memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
-    /* compute dE/db */
+        /* compute dE/db */
-    if (n == order - 1) {
+        if (n == order - 1) {
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = node->unitNum / dimSize[order - 1];
+            reshapedSize[0] = node->unitNum / dimSize[order - 1];
-        reshapedSize[1] = dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
-        /* we reshape dE/dc to a matrix whose column number is equal to the
+            /* we reshape dE/dc to a matrix whose column number is equal to the
-        size of b. Then we can reduce the matrix into a row vector. */
+            size of b. Then we can reduce the matrix into a row vector. */
-        node->grad->Reshape(2, reshapedSize);
+            node->grad->Reshape(2, reshapedSize);
-        XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
+            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
-        _ReduceSum(node->grad, bGradTMP, 0);
+            _ReduceSum(node->grad, bGradTMP, 0);
-        _Sum(bGradTMP, b->grad, b->grad);
+            _Sum(bGradTMP, b->grad, b->grad);
-        DelTensorBuf(bGradTMP);
+            DelTensorBuf(bGradTMP);
-        node->grad->Reshape(order, dimSize);
+            node->grad->Reshape(order, dimSize);
-    }
-    else {
-        int reshapedSize[MAX_TENSOR_DIM_NUM];
-        reshapedSize[0] = 1;
-        reshapedSize[1] = dimSize[n];
-        reshapedSize[2] = 1;
-        for (int i = 0; i < order; i++) {
-            if (i < n)
-                reshapedSize[0] *= dimSize[i];
        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
-        reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
-        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+            reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
-        Then reduce along with z and x to obtain dE/db. */
-        node->grad->Reshape(3, reshapedSize);
-        XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+            Then reduce along with z and x to obtain dE/db. */
+            node->grad->Reshape(3, reshapedSize);
-        _ReduceSum(node->grad, interGrad, 2);
+            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            _ReduceSum(node->grad, interGrad, 2);
-        XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
+            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
-        _ReduceSum(interGrad, bGradTMP, 0);
+            _ReduceSum(interGrad, bGradTMP, 0);
-        _Sum(bGradTMP, b->grad, b->grad);
+            _Sum(bGradTMP, b->grad, b->grad);
-        DelTensorBuf(bGradTMP);
+            DelTensorBuf(bGradTMP);
-        node->grad->Reshape(order, dimSize);
-        DelTensorBuf(interGrad);
+            node->grad->Reshape(order, dimSize);
+            DelTensorBuf(interGrad);
+        }
    }
+    if (!isEfficient || w->isGrad)
+        XNoder::MakeGrad(w);
+    if (!isEfficient || x->isGrad)
+        XNoder::MakeGrad(x);
    /* compute dE/dx, dE/dw */
    XTensor * c = node;
@@ -1590,7 +1653,7 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
    if (x->order == 2 && w->order == 2)
        GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
-    else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
+    else if (transX == X_NOTRANS && x->order > 2 && w->order == 2) {
        int orderBackupX = x->order;
        int orderBackupC = c->order;
        int dimsBackupX[MAX_TENSOR_DIM_NUM];

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -32,33 +32,33 @@
 namespace nts{
 /* compute dE/dx of a node */
-void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
+void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
    CheckNTErrors(node->grad != NULL, "No gradient found!");
    XLink &income = node->income;
    int operID = income.typeID;
-    if(operID == MOVEMENT_COPYINDEXED)
+    if (operID == MOVEMENT_COPYINDEXED)
-        GradCopyIndexed(node, isEfficent);
+        GradCopyIndexed(node, isEfficient);
-    else if(operID == MOVEMENT_GATHER)
+    else if (operID == MOVEMENT_GATHER)
-        GradGather(node, isEfficent);
+        GradGather(node, isEfficient);
    else if (operID == MOVEMENT_DROPOUTWITHINDEX)
-        GradDropoutWithIndex(node, isEfficent);
+        GradDropoutWithIndex(node, isEfficient);
-    else if(operID == SHAPE_MERGE)
+    else if (operID == SHAPE_MERGE)
-        GradMerge(node, isEfficent);
+        GradMerge(node, isEfficient);
-    else if(operID == SHAPE_MERGE_LIST)
+    else if (operID == SHAPE_MERGE_LIST)
-        GradMergeList(node, isEfficent);
+        GradMergeList(node, isEfficient);
-    else if(operID == SHAPE_RESHAPE)
+    else if (operID == SHAPE_RESHAPE)
-        GradReshape(node, isEfficent);
+        GradReshape(node, isEfficient);
-    else if(operID == SHAPE_SPLIT)
+    else if (operID == SHAPE_SPLIT)
-        GradSplit(node, isEfficent);
+        GradSplit(node, isEfficient);
-    else if(operID == SHAPE_SPLIT_LIST)
+    else if (operID == SHAPE_SPLIT_LIST)
-        GradSplitList(node, isEfficent);
+        GradSplitList(node, isEfficient);
    else if (operID == SHAPE_TRANSPOSE)
-        GradTranspose(node, isEfficent);
+        GradTranspose(node, isEfficient);
-    else if(operID == SHAPE_UNSQUEEZE)
+    else if (operID == SHAPE_UNSQUEEZE)
-        GradUnsqueeze(node, isEfficent);
+        GradUnsqueeze(node, isEfficient);
    else{
        ShowNTErrors("TODO!");
    }
@@ -72,10 +72,10 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
 }
 /* post processing of a node */
-void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficent)
+void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficient)
 {
-    if(typeID == SHAPE_SPLIT_LIST)
+    if (typeID == SHAPE_SPLIT_LIST)
-        GradSplitListPost(node, isEfficent);
+        GradSplitListPost(node, isEfficient);
 }
 /* 
@@ -88,7 +88,7 @@ dE/da = spreadforcopyindexed(b)
 >> isEfficient - indicates whether the computation is in
                 an efficient manner
 */
-void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
+void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
 {
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
@@ -100,8 +100,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
    XTensor * srcIndex = income.tails[1];
    XTensor * tgtIndex = income.tails[2];
-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
-    _SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
+        XNoder::MakeGrad(input);
+        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
+        _SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
+        _SumMe(input->grad, tmp);
+        DelTensorBuf(tmp);
+    }
 }
 /* 
@@ -114,16 +121,23 @@ dE/da = spreadforgather(b)
 >> isEfficient - indicates whether the computation is in
                 an efficient manner
 */
-void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
+void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
 {
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
    XTensor * input = income.tails[0];
    XTensor * index = income.tails[1];
-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);
-    _SpreadForGather(input->grad, node->grad, index);
+        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
+        _SpreadForGather(tmp, node->grad, index);
+        _SumMe(input->grad, tmp);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -131,7 +145,7 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
 /*
 gradient computation for DropoutWithIndex function
 */
-void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
+void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
 {
    XLink &income = node->income;
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
@@ -139,28 +153,23 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
    XTensor * input = income.tails[0];
    XTensor * index = income.tails[1];
    DTYPE scale = income.GetParam(0);
-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
-    //_Identity(node->grad, input->grad);
+        XNoder::MakeGrad(input);
-    _CopyValues(node->grad, input->grad);
-    int order = node->grad->order;
+        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
-    int * dimSize = new int[order];
+        _CopyValues(node->grad, tmp);
-    for (int i = 0; i < order; i++) {
+        tmp->Reshape(tmp->unitNum);
-        dimSize[i] = node->grad->dimSize[i];
-    }
-    int order1 = 1;
+        _DropoutWithIndex(node->grad, index, tmp);
-    int * dimSize1 = new int[order1];
+        _ScaleAndShiftMe(tmp, scale);
-    dimSize1[0] = input->grad->unitNum;
-    input->grad->Reshape(order1, dimSize1);
-    _DropoutWithIndex(node->grad, index, input->grad);
+        tmp->Reshape(input->order, input->dimSize);
-    _ScaleAndShiftMe(input->grad, scale);
+        _SumMe(input->grad, tmp);
-    input->grad->Reshape(order, dimSize);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -180,7 +189,7 @@ dE/da = split(dE/dc)
 >> isEfficient - indicates whether the computation is in
                 an efficient manner
 */
-void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
+void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
 {
    XLink &income = node->income;
    XTensor * input = income.tails[0];
@@ -191,62 +200,64 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
    int whereToMerge = income.GetParamInt(0);
    int leadDim = income.GetParamInt(1);
-    int blockSize = 1;
+    if (!isEfficient || input->isGrad) {
-    int blockNum = 1;
+        XNoder::MakeGrad(input);
-    for(int i = 0; i < input->order; i++){
-        if(i < leadDim)
-            blockNum *= input->dimSize[i];
-    }
-    blockSize = input->GetDataSizeInChar() / blockNum;
-    XNoder::MakeGrad(input);
-    int * dims = new int[input->order];
+        int * dims = new int[input->order];
-    memset(dims, 0, sizeof(int) * input->order);
+        memset(dims, 0, sizeof(int) * input->order);
-    for(int i = 0, j = 0; i < input->order; i++){
+        for (int i = 0, j = 0; i < input->order; i++) {
-        if(i >= leadDim){
+            if (i >= leadDim) {
-            dims[j++] = input->dimSize[i];
+                dims[j++] = input->dimSize[i];
+            }
        }
-    }
+        dims[0] = -dims[0];
-    dims[0] = -dims[0];
+        XTensor gradInputSmall(input->order - leadDim, dims,
-    XTensor gradInputSmall(input->order - leadDim, dims,
+                               input->dataType, input->denseRatio,
-                           input->dataType, input->denseRatio, 
+                               input->devID, input->mem);
-                           input->devID, input->mem);
+        dims[whereToMerge - leadDim] *= dims[0];
-    dims[whereToMerge - leadDim] *= dims[0];
+        XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
-    XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
+                              node->dataType, node->denseRatio,
-                          node->dataType, node->denseRatio, 
+                              node->devID, node->mem);
-                          node->devID, node->mem);
+        int blockSize = 1;
-    /* we can simply split the gradient tensor 
+        int blockNum = 1;
-       if the input is used in merging only */
+        for (int i = 0; i < input->order; i++) {
-    if(input->outgo.tailNum == 1){
+            if (i < leadDim)
-        for(int i = 0; i < blockNum; i++){
+                blockNum *= input->dimSize[i];
-            gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+        }
-            gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+        blockSize = input->GetDataSizeInChar() / blockNum;
-            _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+        /* we can simply split the gradient tensor
+           if the input is used in merging only */
+        if (input->outgo.tailNum == 1) {
+            for (int i = 0; i < blockNum; i++) {
+                gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+                gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+                _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+            }
        }
-    }
-    /* a more complicated case is that the input tensor is used for 
-       other operations somewhere else. So we have to do gradient 
-       accumulation after splitting, i.e., we need an additional 
-       SUM operation */
-    else{
-        XTensor gradInputSmallBuf(&gradInputSmall);
-        for(int i = 0; i < blockNum; i++){
+        /* a more complicated case is that the input tensor is used for
-            gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+           other operations somewhere else. So we have to do gradient
-            gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+           accumulation after splitting, i.e., we need an additional
-            _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+           SUM operation */
-            _Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
+        else {
+            XTensor gradInputSmallBuf(&gradInputSmall);
+            for (int i = 0; i < blockNum; i++) {
+                gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
+                gradInputSmall.data = (char*)input->grad->data + i * blockSize;
+                _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
+                _Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
+            }
        }
-    }
-    gradNodeSmall.data = NULL;
+        gradNodeSmall.data = NULL;
-    gradInputSmall.data = NULL;
+        gradInputSmall.data = NULL;
-    delete[] dims;
+        delete[] dims;
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -274,18 +285,18 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
    TensorList smalls(income.tailNum);
    TensorList smallsGrad(income.tailNum);
    bool mergeOnly = true;
-    for(int i = 0; i < income.tailNum; i++){
+    for (int i = 0; i < income.tailNum; i++) {
+        /* TODO: support efficient backpropagation (isEfficient is not honored in this loop yet) */
        XTensor * tail = income.tails[i];
        XNoder::MakeGrad(tail);
        smalls.Add(tail);
        smallsGrad.Add(tail->grad);
-        if(i > 1){
+        if (i > 1)
-            CheckNTErrors(_IsSameShaped(last, tail), 
+            CheckNTErrors(_IsSameShaped(last, tail), "Input tensors must be of the same size!");
-                         "Input tensors must be of the same size!");
-        }
-        if(tail->outgo.tailNum  > 1)
+        if (tail->outgo.tailNum  > 1)
            mergeOnly = false;
        last = tail;
@@ -295,7 +306,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
    /* we can simply split the gradient tensor into the input tensors 
       if the inputs are used in merging only */
-    if(mergeOnly)
+    if (mergeOnly)
        _Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
    /* a more complicated case is that the input tensors are used for 
@@ -321,7 +332,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
                          last->devID, last->mem);
        /* gradient accumulation for each split */
-        for(int i = 0; i < smalls.count; i++){
+        for (int i = 0; i < smalls.count; i++) {
            XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
            gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
            _Sum(inputGrad, &gradSmall, inputGrad);
@@ -344,17 +355,20 @@ dE/da = reshape(dE/db)
 >> isEfficient - indicates whether the computation is in
                 an efficient manner
 */
-void XShapeGrad::GradReshape(XTensor * node, bool isEfficent)
+void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
 {
    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for RESHAPE!");
    XTensor * input = income.tails[0];
-    XNoder::MakeGrad(input);
-    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);
-    node->grad->Reshape(input->order, input->dimSize);
+        node->grad->Reshape(input->order, input->dimSize);
-    _CopyValues(node->grad, input->grad);
+        _CopyValues(node->grad, input->grad);
-    node->grad->Reshape(node->order, node->dimSize);
+        node->grad->Reshape(node->order, node->dimSize);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -381,22 +395,24 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
    CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!");
    CheckNTErrors(splitNum == node->dimSize[0], "Wrong split number!");
-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);
-    /* we can simply merge the gradient tensor 
+        /* we can simply merge the gradient tensor
-       if the input is used in splitting only */
+           if the input is used in splitting only */
-    if(input->outgo.tailNum == 1)
+        if (input->outgo.tailNum == 1)
-        _Merge(node->grad, input->grad, whereToSplit + 1, 0);
+            _Merge(node->grad, input->grad, whereToSplit + 1, 0);
-    /* if the tensor is used somewhere else, we need another SUM
+        /* if the tensor is used somewhere else, we need another SUM
-       for gradient accumulation */
+           for gradient accumulation */
-    else{
+        else {
-        XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
+            XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
-        _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
+            _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
-        _Sum(input->grad, inputGradTMP, input->grad);
+            _Sum(input->grad, inputGradTMP, input->grad);
-        DelTensorBuf(inputGradTMP);
+            DelTensorBuf(inputGradTMP);
+        }
    }
    node->visitMark = NODE_FINISHED;
@@ -444,14 +460,14 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
    int whereToSplit = -1;
    int splitNum = 0;
-    for(int i = 0; i < outgo.tailNum; i++){
+    for (int i = 0; i < outgo.tailNum; i++) {
        XTensor * parent = (XTensor*)outgo.tails[i];
        XLink &income = parent->income;
-        if(income.typeID == SHAPE_SPLIT_LIST){
+        if (income.typeID == SHAPE_SPLIT_LIST) {
            int w = income.GetParamInt(0);
            int splitID = income.GetParamInt(1);
-            if(whereToSplit < 0)
+            if (whereToSplit < 0)
                whereToSplit = w;
            splitNum++;
@@ -463,24 +479,26 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
        }
    }
-    XNoder::MakeGrad(node);
+    if (!isEfficient || node->isGrad) {
+        XNoder::MakeGrad(node);
-    /* we can simply merge the gradient tensor 
+        /* we can simply merge the gradient tensor
-       if the node is used in splitting only */
+           if the node is used in splitting only */
-    if(outgo.tailNum == splitNum){
+        if (outgo.tailNum == splitNum) {
-        _Merge(&splits, node->grad, whereToSplit);
+            _Merge(&splits, node->grad, whereToSplit);
-    }
+        }
-    /* if the tensor is used as input to other nodes
+        /* if the tensor is used as input to other nodes
-       somewhere else, we need another SUM for gradient 
+           somewhere else, we need another SUM for gradient
-       accumulation */
+           accumulation */
-    else{
+        else {
-        XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
+            XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
-        _Merge(&splits, nodeGradTMP, whereToSplit + 1);
+            _Merge(&splits, nodeGradTMP, whereToSplit + 1);
-        _Sum(node->grad, nodeGradTMP, node->grad);
+            _Sum(node->grad, nodeGradTMP, node->grad);
-        DelTensorBuf(nodeGradTMP);
+            DelTensorBuf(nodeGradTMP);
+        }
    }
 }
@@ -501,19 +519,23 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
    XTensor * output = node;
    XTensor * input = income.tails[0];
-    XTensor * b = NewTensorBufV2(input, input->devID, input->mem);
-    XNoder::MakeGrad(input);
-    int i = income.GetParamInt(0);
-    int j = income.GetParamInt(1);
-    CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
+    if (!isEfficient || input->isGrad) {
-    CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
+        XNoder::MakeGrad(input);
-    _Transpose(output->grad, b, i, j);
+        int i = income.GetParamInt(0);
-    _Sum(input->grad, b, input->grad);
+        int j = income.GetParamInt(1);
-    DelTensorBuf(b);
+        CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
+        CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
+        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
+        _Transpose(output->grad, tmp, i, j);
+        _Sum(input->grad, tmp, input->grad);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }
@@ -535,7 +557,6 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
    XTensor * output = node;
    XTensor * input = income.tails[0];
-    XNoder::MakeGrad(input);
    int dim = income.GetParamInt(0);
    int dSize = income.GetParamInt(1);
@@ -543,12 +564,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
    CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
    CheckNTErrors(output->unitNum = input->unitNum * dSize, "Wrong tensor size!");
-    XTensor * g = NewTensorBufV2(input->grad, input->devID, input->mem);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);
-    _ReduceSum(output->grad, g, dim);
-    _Sum(input->grad, g, input->grad);
+        XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
-    DelTensorBuf(g);
+        _ReduceSum(output->grad, tmp, dim);
+        _Sum(input->grad, tmp, input->grad);
+        DelTensorBuf(tmp);
+    }
    node->visitMark = NODE_FINISHED;
 }

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -316,7 +316,6 @@ void XNet::ClearGrad(XTensor * node)
    }
    if(finished){
-        //fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
        delete node->grad;
        node->grad = NULL;
    }

--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
@@ -62,7 +62,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    /* we transform a higher order tensor to a matrix to reduce the number
       of calls to matrix multiplication */
-    if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
+    if (transposedA == X_NOTRANS && a->order > 2 && b->order == 2) {
        int ncolA = a->dimSize[a->order - 1];
        int ncolC = c->dimSize[c->order - 1];
        XTensor * a2 = NewTensor2DV2(a->unitNum/ncolA, -ncolA, a->dataType, a->devID, a->mem);

--- a/source/tensor/core/math/Compare.cpp
+++ b/source/tensor/core/math/Compare.cpp
@@ -199,8 +199,8 @@ void funcName(const XTensor &a, const XTensor &b, XTensor c)                    
 }
 #ifdef USE_CUDA
-_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, max)
+_SIMPLE_MAX_MIN_FUNCTION(_Max, _CudaMax, MAX)
-_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, min)
+_SIMPLE_MAX_MIN_FUNCTION(_Min, _CudaMin, MIN)
 #else
 _SIMPLE_MAX_MIN_FUNCTION(_Max, max)
 _SIMPLE_MAX_MIN_FUNCTION(_Min, min)

--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
+ * All rights reserved.
-*
+ *
-* Licensed under the Apache License, Version 2.0 (the "License");
+ * Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
+ * you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
+ * You may obtain a copy of the License at
-*
+ *
-*   http://www.apache.org/licenses/LICENSE-2.0
+ *   http://www.apache.org/licenses/LICENSE-2.0
-*
+ *
-* Unless required by applicable law or agreed to in writing, software
+ * Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
+ * distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
+ * See the License for the specific language governing permissions and
-* limitations under the License.
+ * limitations under the License.
-*/
+ */
 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
+ */
 #ifndef __SPLIT_H__
 #define __SPLIT_H__

--- a/source/tensor/core/shape/Stack.cpp
+++ b/source/tensor/core/shape/Stack.cpp
@@ -85,7 +85,7 @@ XTensor Stack(const TensorList &smalls, int dim)
 {
    int count = smalls.count;
    CheckNTErrors(count > 0, "Empty list!");
-    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to Stack!");
    XTensor * tensor = smalls.GetItem(0);
    int order = tensor->order + 1;
@@ -95,7 +95,7 @@ XTensor Stack(const TensorList &smalls, int dim)
        if (i < dim)
            dimSize[i] = tensor->GetDim(i);
        else if (i > dim)
-            dimSize[i] = tensor->GetDim(i);
+            dimSize[i] = tensor->GetDim(i-1);
        else if (i == dim)
            dimSize[i] = count;
    }
@@ -149,7 +149,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
 {
    int count = smalls.count;
    CheckNTErrors(count > 0, "Empty list!");
-    CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
+    CheckNTErrors(dim >= 0, "Illegal dimension to Stack!");
    if (!t.isInit || !CheckStackShape(smalls, t, dim)) {
        XTensor * tensor = smalls.GetItem(0);