Commit 3a515f68 by liyinqiao

Merge with XU Chen branch (Don't use this! It's an incomplete version)

1. Support efficient propagation and gradient accumulation for the backward functions.
2. Update the setData functions.
3. Clean up the code.
parent be870567
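The core pattern introduced by this merge (item 1 above) can be summarized with a small sketch. This is an illustration only, not part of the diff; it reuses the NiuTensor calls that appear below (XNoder::MakeGrad, NewTensorBufV2, _Multiply, DelTensorBuf) and mirrors the new GradAbsolute: in efficient mode a node's gradient is only computed when it is actually needed, and the local gradient is accumulated into the existing gradient through a temporary buffer instead of overwriting it.

/* sketch: efficient mode + gradient accumulation for a unary op c = f(a) */
void GradUnaryExample(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    XTensor * a = income.tails[0];

    /* in efficient mode, skip inputs that do not need a gradient */
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

        /* compute the local gradient into a temporary buffer ... */
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Sign(a, tmp);

        /* ... and accumulate dE/da += dE/dc * sign(a) instead of overwriting a->grad */
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
    }

    node->visitMark = NODE_FINISHED;
}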
......@@ -31,43 +31,54 @@ namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
if(!isEfficient){
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else{
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
XTensor * dedx = input->grad;
XTensor * dedy = output->grad;
//XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
XTensor * tmp = NewTensor(output);
if (operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, dedy, tmp);
else if (operID == FUNC_IDENTITY)
_IdentityBackward(output, input, dedy, tmp);
else if (operID == FUNC_LOGSOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else if (operID == FUNC_RECTIFY)
_RectifyBackward(output, input, dedy, tmp);
else if (operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, dedy, tmp);
else if (operID == FUNC_SOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else {
ShowNTErrors("Wrong activation function type!");
}
_SumMe(dedx, tmp);
//DelTensorBuf(tmp);
DelTensor(tmp);
}
node->visitMark = NODE_FINISHED;
......
......@@ -33,7 +33,6 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
......@@ -48,33 +47,33 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
if (!isEfficient || output->isGrad) {
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
dedy->SetDataFixed(1);
return;
}
gold = income.tails[1];
//XTensor * tmp = NewTensorBufV2(output, output->devID, output->mem);
XTensor* tmp = NewTensor(output);
if (operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in cross entropy!");
_CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
_SumMe(dedy, tmp);
}
else {
ShowNTErrors("Wrong loss function type!");
}
//DelTensorBuf(tmp);
DelTensor(tmp);
}
node->visitMark = NODE_FINISHED;
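/* Illustrative note, not part of the diff: item 2 of the commit message replaces the
   per-type initialization that the old loss code spelled out,

       if (dedy->dataType == X_FLOAT)
           _SetDataFixedFloat(dedy, 1.0F);
       else if (dedy->dataType == X_DOUBLE)
           _SetDataFixedDouble(dedy, 1.0);
       else if (dedy->dataType == X_INT)
           _SetDataFixedInt(dedy, 1);
       else
           ShowNTErrors("TODO");

   with a single member call that dispatches on the data type internally:

       dedy->SetDataFixed(1); */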
......@@ -87,79 +86,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
......@@ -30,82 +30,82 @@ namespace nts{
/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if(!isEfficient){
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else{
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_ABSOLUTE)
if (operID == MATH_ABSOLUTE)
GradAbsolute(node, isEfficient);
else if(operID == MATH_COS)
else if (operID == MATH_COS)
GradCos(node, isEfficient);
else if(operID == MATH_EXP)
else if (operID == MATH_EXP)
GradExp(node, isEfficient);
else if(operID == MATH_LOG)
else if (operID == MATH_LOG)
GradLog(node, isEfficient);
else if(operID == MATH_ROUND)
else if (operID == MATH_ROUND)
GradRound(node, isEfficient);
else if(operID == MATH_SIGN)
else if (operID == MATH_SIGN)
GradSign(node, isEfficient);
else if(operID == MATH_SIN)
else if (operID == MATH_SIN)
GradSin(node, isEfficient);
else if(operID == MATH_TAN)
else if (operID == MATH_TAN)
GradTan(node, isEfficient);
else if(operID == MATH_CLIP)
else if (operID == MATH_CLIP)
GradClip(node, isEfficient);
else if(operID == MATH_DIV)
else if (operID == MATH_DIV)
GradDiv(node, isEfficient);
else if(operID == MATH_DIVDIM)
else if (operID == MATH_DIVDIM)
GradDivDim(node, isEfficient);
else if(operID == MATH_MATRIXMUL)
else if (operID == MATH_MATRIXMUL)
GradMatrixMul(node, isEfficient);
else if(operID == MATH_MATRIXMULBATCHED)
else if (operID == MATH_MATRIXMULBATCHED)
GradMatrixMulBatched(node, isEfficient);
else if(operID == MATH_MULTIPLY)
else if (operID == MATH_MULTIPLY)
GradMultiply(node, isEfficient);
else if(operID == MATH_MULTIPLYDIM)
else if (operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node, isEfficient);
else if (operID == MATH_MULTIPLYBROADCAST)
GradMultiplyBroadcast(node, isEfficient);
else if(operID == MATH_NEGATE)
else if (operID == MATH_NEGATE)
GradNegate(node, isEfficient);
else if(operID == MATH_NORMALIZE)
else if (operID == MATH_NORMALIZE)
GradNormalize(node, isEfficient);
else if(operID == MATH_POWER)
else if (operID == MATH_POWER)
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
else if (operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SCALE)
else if (operID == MATH_SCALE)
GradScale(node, isEfficient);
else if(operID == MATH_DESCALE)
else if (operID == MATH_DESCALE)
GradDescale(node, isEfficient);
else if(operID == MATH_SHIFT)
else if (operID == MATH_SHIFT)
GradShift(node, isEfficient);
else if(operID == MATH_SUB)
else if (operID == MATH_SUB)
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
else if (operID == MATH_SUBDIM)
GradSubDim(node, isEfficient);
else if(operID == MATH_SUM)
else if (operID == MATH_SUM)
GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM)
else if (operID == MATH_SUMDIM)
GradSumDim(node, isEfficient);
else if(operID == MATH_SUMBROADCAST)
else if (operID == MATH_SUMBROADCAST)
GradSumBroadcast(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN)
else if (operID == REDUCE_REDUCEMEAN)
GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM)
else if (operID == REDUCE_REDUCESUM)
GradReduceSum(node, isEfficient);
else if(operID == REDUCE_REDUCESUMALL)
else if (operID == REDUCE_REDUCESUMALL)
GradReduceSumAll(node, isEfficient);
else if(operID == REDUCE_REDUCESUMSQUARED)
else if (operID == REDUCE_REDUCESUMSQUARED)
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
else if (operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node, isEfficient);
else if (operID == MATH_MULANDSHIFT)
GradMulAndShift(node, isEfficient);
......@@ -138,14 +138,17 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * sign(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sign(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -166,15 +169,18 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * -sin(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sin(a, b);
_ScaleAndShiftMe(b, -1.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sin(a, tmp);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -195,14 +201,17 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * exp(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Exp(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -224,9 +233,11 @@ void XMathGrad::GradLog(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
/* dE/da = dE/dc * 1/a */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
......@@ -246,8 +257,12 @@ void XMathGrad::GradRound(XTensor * node, bool isEfficient)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
// we do nothing here
// TODO: set grad = 0 if the node is the only child
XTensor * a = income.tails[0];
/* dE/da = 0, we do nothing here */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
......@@ -267,8 +282,12 @@ void XMathGrad::GradSign(XTensor * node, bool isEfficient)
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
// we do nothing here
// TODO: set grad = 0 if the node is the only child
XTensor * a = income.tails[0];
/* dE/da = 0, we do nothing here */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
......@@ -289,14 +308,17 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * cos(a) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -317,15 +339,18 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, b);
_PowerMe(b, -2.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
/* dE/da = dE/dc * 1/(cos(a))^2
= dE/dc * (cos(a))^-2 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Cos(a, tmp);
_PowerMe(tmp, -2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -345,17 +370,21 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
XNoder::MakeGrad(a);
/* dE/da = 1 lower < a < upper
= 0 otherwise */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ClipBackward(node, a, node->grad, tmp, lower, upper);
_SumMe(a->grad, tmp);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -378,21 +407,26 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * ab2 = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
/* dE/da = dE/dc / b */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Div(node->grad, b, a->grad, 1.0F);
}
_Div(node->grad, b, a->grad, 1.0F);
_Power(b, ab2, -2.0F);
_Multiply(a, ab2, ab2);
_ScaleAndShiftMe(ab2, -1.0F);
_Multiply(node->grad, ab2, b->grad, 1.0F);
/* dE/db = dE/dc * a/(-b^2)
= dE/dc * a * (-b^-2) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(b, tmp, -2.0F);
_NegateMe(tmp);
_MultiplyMe(tmp, a);
_Multiply(node->grad, tmp, b->grad, 1.0F);
DelTensorBuf(ab2);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -416,87 +450,82 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da = dE/dc * (1/b) */
_DivDim(node->grad, b, a->grad, n, 1.0);
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_DivDim(node->grad, b, a->grad, n, 1.0);
}
/* dE/db = dE/dc * dc/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
/* dE/db = dE/dc * dc/db
= (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2.0F);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Negate(a, aTMP1);
_Power(b, bTMP, -2.0F);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
_Multiply(node->grad, aTMP2, interGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
_SumMe(b->grad, bGradTMP);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
_SumMe(b->grad, bGradTMP2);
DelTensorBuf(bGradTMP2);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}*/
DelTensorBuf(interGrad);
}
DelTensorBuf(interGrad);
}
DelTensorBuf(interGradTMP);
DelTensorBuf(bTMP);
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
DelTensorBuf(interGradTMP);
DelTensorBuf(bTMP);
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
}
node->visitMark = NODE_FINISHED;
}
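/* Worked example of the reshape-and-reduce trick above (illustrative only; the sizes
   are hypothetical): let a have shape (2, 3, 4) and let b be broadcast along dim n = 1,
   so |b| = 3. The intermediate dE/dc * (-a/b^2) has the shape of a; reshaping it to
   (x, y, z) = (2, 3, 4) with y = |b|, reducing over z gives a 2 x 3 matrix, and reducing
   that over dim 0 gives the 3-element gradient that is accumulated into b->grad.
   When n is the last dimension, a single reshape to (unitNum/|b|, |b|) followed by one
   reduction over dim 0 is enough. */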
......@@ -523,9 +552,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
XNoder::MakeGrad(a);
if(!isEfficient || b->isGrad)
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
XTensor * c = node;
......@@ -533,9 +562,9 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2)
if (a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
else if (transA == X_NOTRANS && a->order > 2 && b->order == 2){
int orderBackupA = a->order;
int orderBackupC = c->order;
int dimsBackupA[MAX_TENSOR_DIM_NUM];
......@@ -545,7 +574,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
......@@ -553,7 +582,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
a->Reshape(orderBackupA, dimsBackupA);
c->Reshape(orderBackupC, dimsBackupC);
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
deda->Reshape(orderBackupA, dimsBackupA);
dedc->Reshape(orderBackupC, dimsBackupC);
}
......@@ -580,54 +609,54 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
XTensor * dedc, DTYPE alpha, bool isEfficient)
{
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
if (transA == X_NOTRANS && transB == X_NOTRANS) {
/* dE/da = dE/dc * b^T * \alpha */
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
if(!isEfficient || b->isGrad)
if (!isEfficient || b->isGrad)
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){
else if (transA == X_TRANS && transB == X_NOTRANS){
/* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
if(!isEfficient || b->isGrad)
if (!isEfficient || b->isGrad)
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){
else if (transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */
if(!isEfficient || b->isGrad)
if (!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){
else if (transA == X_TRANS && transB == X_TRANS){
/* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad)
if (!isEfficient || a->isGrad)
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */
if(!isEfficient || b->isGrad)
if (!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
}
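/* Illustrative derivation for one of the transposed cases (not part of the diff):
   if c = a^T * b * \alpha, then c_{ij} = \alpha * \sum_k a_{ki} * b_{kj}, so
   dE/da_{ki} = \alpha * \sum_j dE/dc_{ij} * b_{kj} = \alpha * (b * (dE/dc)^T)_{ki},
   which is exactly _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F). */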
......@@ -655,55 +684,65 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
if (!isEfficient || a->isGrad)
XNoder::MakeGrad(a);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
if (transA == X_NOTRANS && transB == X_NOTRANS) {
/* dE/da = dE/dc * b^T * \alpha */
_MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
if (!isEfficient || a->isGrad)
_MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
_MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
if (!isEfficient || b->isGrad)
_MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){
else if (transA == X_TRANS && transB == X_NOTRANS) {
/* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */
_MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
if (!isEfficient || a->isGrad)
_MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
_MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
if (!isEfficient || b->isGrad)
_MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){
else if (transA == X_NOTRANS && transB == X_TRANS) {
/* dE/da = dE/dc * b * \alpha */
_MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
if (!isEfficient || a->isGrad)
_MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */
_MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){
else if (transA == X_TRANS && transB == X_TRANS) {
/* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */
_MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
if (!isEfficient || a->isGrad)
_MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
node->visitMark = NODE_FINISHED;
......@@ -730,11 +769,13 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
CheckNTErrors(_IsSameShaped(a, b), "Wrong sized input tensors!");
/* dE/da = dE/dc * b */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F);
}
/* dE/db = dE/dc * a */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
......@@ -762,77 +803,70 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
/* dE/da = dE/dc * b */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
}
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
/*}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}*/
DelTensorBuf(interGrad);
DelTensorBuf(interGrad);
}
DelTensorBuf(bGradTMP);
}
DelTensorBuf(bGradTMP);
node->visitMark = NODE_FINISHED;
}
......@@ -859,11 +893,18 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
/* dE/da = dE/dc * b */
if (!isEfficient || a->isGrad)
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
/* dE/db = (dE/dc * a).reduce(0...n) */
if (!isEfficient || b->isGrad) {
if (b->isVar || b->income.tailNum > 0)
ShowNTErrors("TODO");
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -882,14 +923,12 @@ void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
DelTensorBuf(b);
/* dE/da = dE/dc * (-1) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, -1.0F);
}
node->visitMark = NODE_FINISHED;
}
......@@ -903,7 +942,6 @@ gradient for normalize
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("TODO!");
}
/*
......@@ -922,17 +960,20 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
/* dE/da = (dE/dc) * p * a^(p-1) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Power(a, b, p - 1.0F);
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, p - 1.0F);
_ScaleAndShiftMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -956,9 +997,12 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * scale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
......@@ -982,9 +1026,12 @@ void XMathGrad::GradScale(XTensor * node, bool isEfficient)
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
/* dE/da = dE/dc * scale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1008,9 +1055,12 @@ void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a);
/* dE/da = dE/dc / descale */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1/descale);
_Sum(a->grad, node->grad, a->grad, 1 / descale);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1032,9 +1082,12 @@ void XMathGrad::GradShift(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
/* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
_Sum(a->grad, node->grad, a->grad);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1059,11 +1112,17 @@ void XMathGrad::GradSub(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, -beta);
/* dE/db = -dE/dc * \beta */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, -beta);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1087,81 +1146,70 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
/* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
/* dE/db = - dE/dc.reduce(0,...,n-1,n+1,...) * \beta */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
_ReduceSum(node->grad, interGrad, 2);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
}
node->visitMark = NODE_FINISHED;
......@@ -1174,7 +1222,6 @@ c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
......@@ -1188,12 +1235,14 @@ void XMathGrad::GradSum(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
if(!isEfficient || a->isGrad){
/* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
if(!isEfficient || b->isGrad){
/* dE/db = dE/dc * \beta */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta);
}
......@@ -1221,81 +1270,72 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
if (!isEfficient || a->isGrad) {
/* dE/da = dE/dc */
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
/* dE/db = dE/dc.reduce(0,...,n-1,n+1,...) * \beta */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}*/
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
_ReduceSum(node->grad, interGrad, 2);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}*/
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
}
node->visitMark = NODE_FINISHED;
}
......@@ -1322,12 +1362,20 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
XTensor * b = income.tails[1];
//DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
/* dE/da = dE/dc */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
if(b->isVar || b->income.tailNum > 0){
ShowNTErrors("TODO");
/* dE/db = dE/dc.reduce(0..n) * \beta */
if (!isEfficient || b->isGrad) {
if (b->isVar || b->income.tailNum > 0) {
ShowNTErrors("TODO");
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1347,18 +1395,21 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
/* dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim] */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_ScaleAndShiftMe(b, 1.0F/n);
_Sum(a->grad, b, a->grad);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_ScaleAndShiftMe(tmp, 1.0F / n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(b);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1368,7 +1419,7 @@ gradient for reduceSum
for
c = reduceSum(a, dim)
we have
dE/da = Unsqueeze(dE/dc) * 1
dE/da = Unsqueeze(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
......@@ -1380,17 +1431,19 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
/* dE/da = Unsqueeze(dE/dc) */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
DelTensorBuf(b);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1412,16 +1465,17 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBufV2(a, a->devID, a->mem);
XNoder::MakeGrad(a);
DTYPE value = node->grad->Get0D();
_SetDataFixed(b, (void*)&value);
_Sum(a->grad, b, a->grad);
/* dE/da = dE/dc * 1 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
DelTensorBuf(b);
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
DTYPE value = node->grad->Get0D();
tmp->SetDataFixed(value);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1452,22 +1506,28 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) */
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
}
/* dE/db = dE/dc * -2 * \sum_i (a_i - b) */
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_ReduceSum(d, f, dim);
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
}
DelTensorBuf(f);
DelTensorBuf(e);
......@@ -1504,22 +1564,27 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da_i = Unsqueeze(dE/dc) * 2 * (a_i - b) / n */
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(d, e, a->grad, 1.0F);
}
/* dE/db = dE/dc * (-2/n) * \sum_i (a_i - b) */
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_ReduceSum(d, f, dim);
_ScaleAndShiftMe(f, -2.0F / n);
_Multiply(node->grad, f, b->grad, 1.0F);
}
DelTensorBuf(f);
DelTensorBuf(e);
......@@ -1529,7 +1594,6 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
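/* Illustrative derivation (not part of the diff): for
   c = reduceVariance(a, dim, b) = (1/n) * \sum_i (a_i - b)^2,
   we get dc/da_i = (2/n) * (a_i - b) and dc/db = -(2/n) * \sum_i (a_i - b),
   which matches the 2.0F / n and -2.0F / n scale factors applied above. */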
/*
gradient for operation
for c = matmul(x, w) + b
......@@ -1554,67 +1618,67 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
DTYPE alpha = income.GetParam(3);
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
if (!isEfficient || b->isGrad)
/* dE/db = dE/dc.reduce(0,...,n-1,n+1,...) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
/* compute dE/db */
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* compute dE/db */
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
_ReduceSum(node->grad, interGrad, 2);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
node->grad->Reshape(order, dimSize);
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
DelTensorBuf(interGrad);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
}
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
/* compute dE/dx, dE/dw */
XTensor * c = node;
XTensor * dedc = node->grad;
......@@ -1623,7 +1687,7 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
if (x->order == 2 && w->order == 2)
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, alpha, isEfficient);
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2) {
int orderBackupX = x->order;
int orderBackupC = c->order;
int dimsBackupX[MAX_TENSOR_DIM_NUM];
......
......@@ -34,35 +34,35 @@ namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if(!isEfficient){
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else{
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
if(operID == MOVEMENT_COPYINDEXED)
if (operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficient);
else if(operID == MOVEMENT_GATHER)
else if (operID == MOVEMENT_GATHER)
GradGather(node, isEfficient);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficient);
else if(operID == SHAPE_MERGE)
else if (operID == SHAPE_MERGE)
GradMerge(node, isEfficient);
else if(operID == SHAPE_MERGE_LIST)
else if (operID == SHAPE_MERGE_LIST)
GradMergeList(node, isEfficient);
else if(operID == SHAPE_RESHAPE)
else if (operID == SHAPE_RESHAPE)
GradReshape(node, isEfficient);
else if(operID == SHAPE_SPLIT)
else if (operID == SHAPE_SPLIT)
GradSplit(node, isEfficient);
else if(operID == SHAPE_SPLIT_LIST)
else if (operID == SHAPE_SPLIT_LIST)
GradSplitList(node, isEfficient);
else if (operID == SHAPE_TRANSPOSE)
GradTranspose(node, isEfficient);
else if(operID == SHAPE_UNSQUEEZE)
else if (operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node, isEfficient);
else{
ShowNTErrors("TODO!");
......@@ -77,10 +77,10 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
}
/* post processing of a node */
void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficent)
void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficient)
{
if(typeID == SHAPE_SPLIT_LIST)
GradSplitListPost(node, isEfficent);
if (typeID == SHAPE_SPLIT_LIST)
GradSplitListPost(node, isEfficient);
}
/*
......@@ -93,7 +93,7 @@ dE/da = spreadforcopyindexed(b)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
......@@ -105,8 +105,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
XTensor * srcIndex = income.tails[1];
XTensor * tgtIndex = income.tails[2];
XNoder::MakeGrad(input);
_SpreadForCopyIndexed(input->grad, node->grad, dim, srcIndex, tgtIndex, copyNum);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
}
}
/*
......@@ -119,16 +126,23 @@ dE/da = spreadforgather(b)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
_SpreadForGather(input->grad, node->grad, index);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -136,7 +150,7 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
......@@ -144,28 +158,23 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
int order = node->grad->order;
int * dimSize = new int[order];
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
tmp->Reshape(tmp->unitNum);
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, tmp);
_ScaleAndShiftMe(tmp, scale);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
tmp->Reshape(input->order, input->dimSize);
_SumMe(input->grad, tmp);
input->grad->Reshape(order, dimSize);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -185,7 +194,7 @@ dE/da = split(dE/dc)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -196,62 +205,64 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1);
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < input->order; i++){
if(i < leadDim)
blockNum *= input->dimSize[i];
}
blockSize = input->GetDataSizeInChar() / blockNum;
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){
dims[j++] = input->dimSize[i];
int * dims = new int[input->order];
memset(dims, 0, sizeof(int) * input->order);
for (int i = 0, j = 0; i < input->order; i++) {
if (i >= leadDim) {
dims[j++] = input->dimSize[i];
}
}
}
dims[0] = -dims[0];
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
node->dataType, node->denseRatio,
node->devID, node->mem);
/* we can simply split the gradient tensor
if the input is used in merging only */
if(input->outgo.tailNum == 1){
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
dims[0] = -dims[0];
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
node->dataType, node->denseRatio,
node->devID, node->mem);
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < leadDim)
blockNum *= input->dimSize[i];
}
blockSize = input->GetDataSizeInChar() / blockNum;
/* we can simply split the gradient tensor
if the input is used in merging only */
if (input->outgo.tailNum == 1) {
for (int i = 0; i < blockNum; i++) {
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
}
}
}
/* a more complicated case is that the input tensor is used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
else{
XTensor gradInputSmallBuf(&gradInputSmall);
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
/* a more complicated case is that the input tensor is used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
else {
XTensor gradInputSmallBuf(&gradInputSmall);
for (int i = 0; i < blockNum; i++) {
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
}
}
}
gradNodeSmall.data = NULL;
gradInputSmall.data = NULL;
gradNodeSmall.data = NULL;
gradInputSmall.data = NULL;
delete[] dims;
delete[] dims;
}
node->visitMark = NODE_FINISHED;
}
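As a concrete check of the block bookkeeping above (shapes here are illustrative only): for an input of shape (4, 2, 5) merged with leadDim = 1, blockNum = 4 (the product of the dimensions before leadDim) and blockSize = GetDataSizeInChar() / blockNum = 2 * 5 * sizeof(DTYPE) bytes, so the loop walks 4 blocks of shape (2, 5) and splits each slice of node->grad back into the corresponding slice of input->grad.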
......@@ -279,18 +290,18 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
TensorList smalls(income.tailNum);
TensorList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
for (int i = 0; i < income.tailNum; i++) {
/* TODO! efficient backpropagate */
XTensor * tail = income.tails[i];
XNoder::MakeGrad(tail);
smalls.Add(tail);
smallsGrad.Add(tail->grad);
if(i > 1){
CheckNTErrors(_IsSameShaped(last, tail),
"Input tensors must be of the same size!");
}
if (i > 1)
CheckNTErrors(_IsSameShaped(last, tail), "Input tensors must be of the same size!");
if(tail->outgo.tailNum > 1)
if (tail->outgo.tailNum > 1)
mergeOnly = false;
last = tail;
......@@ -300,7 +311,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
/* we can simply split the gradient tensor into the input tensors
if the inputs are used in merging only */
if(mergeOnly)
if (mergeOnly)
_Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
/* a more complicated case is that the input tensors are used for
......@@ -326,7 +337,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
last->devID, last->mem);
/* gradient accumulation for each split */
for(int i = 0; i < smalls.count; i++){
for (int i = 0; i < smalls.count; i++) {
XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
_Sum(inputGrad, &gradSmall, inputGrad);
......@@ -349,17 +360,20 @@ dE/da = reshape(dE/db)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradReshape(XTensor * node, bool isEfficent)
void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for RESHAPE!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
node->grad->Reshape(input->order, input->dimSize);
_CopyValues(node->grad, input->grad);
node->grad->Reshape(node->order, node->dimSize);
node->grad->Reshape(input->order, input->dimSize);
_CopyValues(node->grad, input->grad);
node->grad->Reshape(node->order, node->dimSize);
}
node->visitMark = NODE_FINISHED;
}
......@@ -386,22 +400,24 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!");
CheckNTErrors(splitNum == node->dimSize[0], "Wrong split number!");
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
/* we can simply merge the gradient tensor
if the input is used in splitting only */
if(input->outgo.tailNum == 1)
_Merge(node->grad, input->grad, whereToSplit + 1, 0);
/* we can simply merge the gradient tensor
if the input is used in splitting only */
if (input->outgo.tailNum == 1)
_Merge(node->grad, input->grad, whereToSplit + 1, 0);
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else{
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
}
}
node->visitMark = NODE_FINISHED;
......@@ -449,14 +465,14 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
int whereToSplit = -1;
int splitNum = 0;
for(int i = 0; i < outgo.tailNum; i++){
for (int i = 0; i < outgo.tailNum; i++) {
XTensor * parent = (XTensor*)outgo.tails[i];
XLink &income = parent->income;
if(income.typeID == SHAPE_SPLIT_LIST){
if (income.typeID == SHAPE_SPLIT_LIST) {
int w = income.GetParamInt(0);
int splitID = income.GetParamInt(1);
if(whereToSplit < 0)
if (whereToSplit < 0)
whereToSplit = w;
splitNum++;
......@@ -468,24 +484,26 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
}
}
XNoder::MakeGrad(node);
if (!isEfficient || node->isGrad) {
XNoder::MakeGrad(node);
/* we can simply merge the gradient tensor
if the node is used in splitting only */
if(outgo.tailNum == splitNum){
_Merge(&splits, node->grad, whereToSplit);
}
/* we can simply merge the gradient tensor
if the node is used in splitting only */
if (outgo.tailNum == splitNum) {
_Merge(&splits, node->grad, whereToSplit);
}
/* if the tensor is used as input to other nodes
somewhere else, we need another SUM for gradient
accumulation */
else{
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
/* if the tensor is used as input to other nodes
somewhere else, we need another SUM for gradient
accumulation */
else {
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
}
}
}
......@@ -506,19 +524,23 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
XTensor * output = node;
XTensor * input = income.tails[0];
XTensor * b = NewTensorBufV2(input, input->devID, input->mem);
XNoder::MakeGrad(input);
int i = income.GetParamInt(0);
int j = income.GetParamInt(1);
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
_Transpose(output->grad, b, i, j);
_Sum(input->grad, b, input->grad);
DelTensorBuf(b);
int i = income.GetParamInt(0);
int j = income.GetParamInt(1);
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
......@@ -540,7 +562,6 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
XTensor * output = node;
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1);
......@@ -548,12 +569,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
XTensor * g = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, g, dim);
_Sum(input->grad, g, input->grad);
DelTensorBuf(g);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
}
node->visitMark = NODE_FINISHED;
}
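In the notation used by the comments above, the rule implemented here is dE/da = reducesum(dE/db, dim): since b = unsqueeze(a) replicates a dSize times along dimension dim, each copy contributes its gradient back to a, which is exactly the _ReduceSum over dim followed by the _Sum accumulation into input->grad.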
......
......@@ -42,55 +42,55 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficent);
void GradCopyIndexed(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficent);
void GradGather(XTensor * node, bool isEfficient);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
void GradDropoutWithIndex(XTensor * node, bool isEfficient);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
void GradMerge(XTensor * node, bool isEfficient);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficent);
void GradMergeList(XTensor * node, bool isEfficient);
/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficent);
void GradTranspose(XTensor * node, bool isEfficient);
/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficent);
void GradReshape(XTensor * node, bool isEfficient);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficent);
void GradSplit(XTensor * node, bool isEfficient);
/* gradient computation for splitting. We return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficent);
void GradSplitList(XTensor * node, bool isEfficient);
/* gradient computation for splitting. We return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at one time. This is good for speeding up the system. */
static
void GradSplitListPost(XTensor * node, bool isEfficent);
void GradSplitListPost(XTensor * node, bool isEfficient);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficent);
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
......
......@@ -316,7 +316,6 @@ void XNet::ClearGrad(XTensor * node)
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
......
......@@ -171,7 +171,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol);
first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
......@@ -195,7 +195,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
paddingDec.SetDataFixed(1);
XTensor maskDec;
XTensor maskEncDec;
......
......@@ -503,7 +503,7 @@ void T2TSearch::Dump(XTensor * output)
int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
output->SetDataFixed(-1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
......
......@@ -50,6 +50,15 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
#define CheckDataType(a, b) \
{ \
if(GetDataTypeName(a) != GetDataTypeName(b)){ \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same datatype (%s vs %s)\n", \
__FILENAME__, __LINE__, GetDataTypeName(a), GetDataTypeName(b)); \
exit(1); \
} \
}
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
\ No newline at end of file
......@@ -64,7 +64,7 @@
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
namespace nts{
int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex;
......@@ -73,7 +73,7 @@ XTensor NULLTensor;
/* generate a tensor id */
int MakeTensorID()
{
if (tensorIDGlobal == 0)
if(tensorIDGlobal == 0)
MUTEX_INIT(tensorMutex);
MUTEX_LOCK(tensorMutex);
......@@ -145,7 +145,7 @@ XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE
mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID;
if (order >= 0)
if(order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio);
}
......@@ -158,8 +158,8 @@ XTensor::XTensor(const XTensor& reference)
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
if (reference.isTmp) {
if(reference.isTmp){
devID = reference.devID;
mem = reference.mem;
data = reference.data;
......@@ -172,16 +172,16 @@ XTensor::XTensor(const XTensor& reference)
This is VERY tricky and there might be better solutions :) */
*reference.dataP = NULL;
}
else {
else{
devID = reference.devID;
mem = reference.mem;
InitTensorV2(this, &reference);
_CopyValues(&reference, this);
}
if (reference.isTmp)
if(reference.isTmp)
XLink::Replace(&reference, this);
else {
else{
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
XLink::CopyIncoming(&reference, this);
}
......@@ -225,7 +225,7 @@ XTensor::~XTensor()
the connectivity of the graph. To avoid memory
leaks, we release the data of the new tensor
when its parent is deleted (see ClearIncoming). */
if (outgo.tailNum > 0) {
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
......@@ -243,7 +243,7 @@ XTensor::~XTensor()
DestroyData();
if (grad != NULL)
if(grad != NULL)
delete grad;
}
......@@ -288,16 +288,16 @@ void XTensor::Init()
/* delete data arrays */
void XTensor::DestroyData()
{
if (data != NULL && mem == NULL && !isShared)
if(data != NULL && mem == NULL && !isShared)
XMemFree(devID, data);
else if (data != NULL && isInGlobalMem)
else if(data != NULL && isInGlobalMem)
FreeData(this, mem);
else if (data != NULL)
else if(data != NULL)
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if (dataHost != NULL)
if(dataHost != NULL)
delete[] (char*)dataHost;
dataHost = NULL;
}
......@@ -330,11 +330,11 @@ XTensor& XTensor::operator= (const XTensor& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if (outgo.tailNum > 0) {
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
......@@ -350,35 +350,35 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dataHost = NULL;
}
if (false && !tensor.isTmp) {
if(false && !tensor.isTmp){
/* NOTE: this might lead to additional data copy by Mac LLVM compilers */
/* we make an identity transformation here */
if (outgo.tailNum > 0)
if(outgo.tailNum > 0)
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
if (!_IsSameShaped(this, &tensor))
if(!_IsSameShaped(this, &tensor))
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_Identity(&tensor, this);
XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY);
}
else {
else{
/* hard copy of the data array */
int size = unitNum * unitSize;
if (isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
if(isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
{
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if (dataHost != NULL && tensor.dataHost != NULL)
if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else {
else{
DestroyData();
if (!isInit) {
if(!isInit){
devID = tensor.devID;
mem = tensor.mem;
}
......@@ -407,11 +407,11 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if (outgo.tailNum > 0) {
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
......@@ -520,7 +520,7 @@ relocate the data on the target device
*/
void XTensor::SetDevice(int myDevId, XMem* myMem)
{
if (myMem == NULL) {
if(myMem == NULL){
myMem = GMems.GetMem(myDevId);
}
FlushToMem(myMem);
......@@ -529,7 +529,7 @@ void XTensor::SetDevice(int myDevId, XMem* myMem)
bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
{
if (a == NULL || b == NULL)
if(a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
......@@ -541,18 +541,18 @@ bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
return false;
}
else if (i >= dim) {
if (a->dimSize[i + 1] != b->dimSize[i])
if (a->dimSize[i+1] != b->dimSize[i])
return false;
}
}
if (a->dataType != b->dataType)
if(a->dataType != b->dataType)
return false;
if (a->denseRatio != b->denseRatio)
if(a->denseRatio != b->denseRatio)
return false;
if (a->isSparse != b->isSparse)
if(a->isSparse != b->isSparse)
return false;
return true;
......@@ -579,7 +579,7 @@ int XTensor::GetDim(const int dim) const
CheckNTErrors(dim >= -order, "dimenision is out of range!");
int d = dim;
if (dim < 0)
if(dim < 0)
d = order + dim;
return dimSize[d];
......@@ -595,7 +595,7 @@ void XTensor::Reshape(const int myOrder, const int* myDimSize)
int dims[MAX_TENSOR_DIM_NUM];
int num = 1;
for (int i = 0; i < myOrder; i++) {
for(int i = 0; i < myOrder; i++){
num *= myDimSize[i];
dims[i] = abs(myDimSize[i]);
}
......@@ -663,7 +663,7 @@ XTensor XTensor::TypeAs(const XTensor input)
/* get the number of items in the data array */
int XTensor::GetSize() const
{
if (isSparse)
if(isSparse)
return unitNumNonZero;
else
return unitNum;
......@@ -672,13 +672,13 @@ int XTensor::GetSize() const
/* get the size of the memory space used */
int XTensor::GetDataSizeInChar() const
{
if (isSparse) {
if(isSparse){
int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
return size;
}
else {
else{
return unitNum * unitSize;
}
}
......@@ -690,15 +690,15 @@ get unit size in terms of "dataType"
*/
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const
{
if (myDataType == X_INT)
if(myDataType == X_INT)
return sizeof(int);
else if (myDataType == X_FLOAT)
else if(myDataType == X_FLOAT)
return sizeof(float);
else if (myDataType == X_DOUBLE)
else if(myDataType == X_DOUBLE)
return sizeof(double);
else if (myDataType == X_INT8)
else if(myDataType == X_INT8)
return 1;
else if (myDataType == X_FLOAT16)
else if(myDataType == X_FLOAT16)
return 2;
return sizeof(float);
}
......@@ -739,19 +739,19 @@ a vector with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
{
if (data == NULL)
if(data == NULL)
return;
if (isSparse) {
if (devID >= 0) {
if(isSparse){
if(devID >= 0){
#ifdef USE_CUDA
int size = sizeof(int) + (sizeof(int) + sizeof(DTYPE)) * unitNumNonZero;
int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if (stream == NULL)
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
......@@ -764,14 +764,14 @@ void XTensor::SetZeroAll(XStream* stream)
unitNumNonZero = 0;
}
else {
if (devID >= 0) {
else{
if(devID >= 0){
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if (stream == NULL)
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
......@@ -791,7 +791,7 @@ void XTensor::SetZeroAll(XStream* stream)
*/
void XTensor::SetData(const void* d, int num, int beg)
{
if (data == NULL || d == NULL)
if(data == NULL || d == NULL)
return;
CheckNTErrors(!isSparse, "TODO");
......@@ -816,6 +816,16 @@ void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
_SetDataRange(this, lower, upper, step);
}
/* generate data items with a fixed value */
template<class T>
void XTensor::SetDataFixed(T num)
{
_SetDataFixed(this, num);
}
template void XTensor::SetDataFixed<int>(int);
template void XTensor::SetDataFixed<float>(float);
template void XTensor::SetDataFixed<double>(double);
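A minimal usage sketch of the new templated setter (the tensor name and shape are illustrative; InitTensor and SetDataFixed are used as they appear elsewhere in this diff):
XTensor mask;
int dims[2] = {3, 4};
InitTensor(&mask, 2, dims, X_INT);
mask.SetDataFixed(1); /* replaces the old _SetDataFixedInt(&mask, 1) */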
/*
set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
......@@ -823,62 +833,7 @@ set the tensor items by a uniform distribution in range [lower, upper]
*/
void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
{
// TODO: GPU code!!!!!!!
if (data == NULL)
return;
// srand((unsigned)time(0));
DTYPE variance = upper - lower;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
DTYPE value = lower + variance * (float)rand() / RAND_MAX;
*((float*)d + i) = value;
}
}
else if (dataType == X_DOUBLE) {
d = new double[unitNum];
for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
}
else {
delete[] (double*)d;
}
}
/* a gauss distribution (Box-Muller method) */
double GaussRand(DTYPE mean, DTYPE standardDeviation)
{
// TODO: GPU code!!!!!!!
static double u, v;
static int phase = 0;
double z;
double pi = 3.141592654;
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
phase = 1 - phase;
return mean + (z * standardDeviation);
_SetDataRand(this, lower, upper);
}
/*
......@@ -888,37 +843,7 @@ set the tensor items by a normal distribution
*/
void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
{
// TODO: cuda code!!!!!!!
if (data == NULL)
return;
// srand((unsigned)time(0));
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
*((float*)d + i) = (float)GaussRand(mean, standardDeviation);
}
}
else if (dataType == X_DOUBLE) {
d = new double[unitNum];
for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = GaussRand(mean, standardDeviation);
}
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
}
else {
delete[] (double*)d;
}
_SetDataRandN(this, mean, standardDeviation);
}
/*
......@@ -990,20 +915,20 @@ void* XTensor::GetCell(int index[], int size) const
CheckNTErrors((size == order), "Illegal index!");
int offset = index[0];
for (int i = 1; i < size; ++i) {
for(int i = 1; i < size; ++i){
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
}
if (isSparse) {
if(isSparse){
DTYPE value;
void* p;
if (BinarySearch(offset, value, p))
if(BinarySearch(offset, value, p))
return (char*)p + sizeof(int);
else
return NULL;
}
else {
else{
return ((char*)data) + offset * unitSize;
}
}
......@@ -1089,9 +1014,9 @@ int XTensor::GetInt(int offset) const
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
int* address = (int*)data + offset;
return ToCPUInt(devID, address);
}
......@@ -1195,7 +1120,7 @@ int XTensor::GetKeyInSparse(int i) const
char* d = (char*)data + sizeof(int);
int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
return ToCPUInt(devID, key);
}
......@@ -1308,9 +1233,9 @@ bool XTensor::SetInt(int value, int offset)
{
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
int* d = (int*)data + offset;
return SetToDeviceInt(devID, d, value);
}
......@@ -1415,7 +1340,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
CheckNTErrors(isSparse == false, "TODO!");
if (devID < 0) {
if(devID < 0){
DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi;
CheckNTErrors((p != NULL), "No data array is found!");
......@@ -1424,7 +1349,7 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
return true;
}
else {
else{
int dims[2] = {ni, mi};
return SetToDevice(devID, GetCell(dims, 2), Get2D(ni, mi) + value);
}
......@@ -1433,24 +1358,24 @@ bool XTensor::Add2D(DTYPE value, int ni, int mi)
/* get the number of non-zero elements (in a sparse tensor) */
int XTensor::GetNonzeroSize() const
{
if (!isSparse) {
if(!isSparse){
XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n");
CheckNTErrors(devID < 0, "TODO");
if (dataType == DEFAULT_DTYPE) {
if(dataType == DEFAULT_DTYPE){
int count = 0;
for (int i = 0; i < unitNum; i++) {
for(int i = 0; i < unitNum; i++){
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if (value == 0)
if(value == 0)
count++;
}
return count;
}
else {
else{
ShowNTErrors("TODO!");
return -1;
}
}
else {
else{
/* return the head of the tuple list */
return unitNumNonZero;
}
......@@ -1481,7 +1406,7 @@ set the tensor as "variable"
void XTensor::SetVarFlag(bool myIsVar)
{
isVar = myIsVar;
if (isVar)
if(isVar)
SetGradFlag(true);
}
......@@ -1497,7 +1422,7 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
/* free old mem */
if (data != NULL) {
if(data != NULL){
if (mem == NULL)
XMemFree(devID, data);
else
......@@ -1513,11 +1438,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
bool filledData = true;
bool zeroData = false;
for (int i = 0; i < order; i++) {
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
if (myDimSize[i] < 0)
if(myDimSize[i] < 0)
filledData = false;
if (myDimSize[i] == 0)
if(myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
}
......@@ -1528,17 +1453,17 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if (myDataType != DEFAULT_DTYPE)
if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
if (zeroData) {
if(zeroData){
unitNum = 0;
return false;
}
if (isSparse) {
if(isSparse){
/*
for sparse matrices, we use a list of tuples (key, value),
ordered by key. Take a (2-dimensional) matrix as an example,
......@@ -1560,18 +1485,18 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
if (filledData) {
if(filledData){
int* d = NULL;
if (mem == NULL) {
if(mem == NULL){
d = new int[size];
memset(d, 0, size);
}
else {
else{
d = (int*)mem->Alloc(mem->devID, size);
}
if (d == NULL)
if(d == NULL)
return false;
#if !defined(UNSAFE_BUT_FAST_MEM)
......@@ -1581,11 +1506,11 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
}
return true;
}
else {
if (filledData) {
else{
if(filledData){
/* allocate the new one */
if (mem == NULL) {
data = XMemAlloc(devID, unitNum * unitSize);
if(mem == NULL){
data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM)
XMemSet(devID, data, 0, unitNum * unitSize);
#endif
......@@ -1593,12 +1518,12 @@ bool XTensor::Resize(const int myOrder, const int* myDimSize,
else
data = (void*)mem->Alloc(mem->devID, unitNum * unitSize);
if (data == NULL)
if(data == NULL)
return false;
}
#if !defined(UNSAFE_BUT_FAST_MEM)
if (data != NULL)
if(data != NULL)
XMem::SetZero(data, unitNum * unitSize, mem);
#endif
return true;
......@@ -1614,7 +1539,7 @@ bool XTensor::Resize(const XTensor* myTensor)
denseRatio = myTensor->denseRatio;
TENSOR_DATA_TYPE myDataType = myTensor->dataType;
if (myDataType != DEFAULT_DTYPE)
if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
......@@ -1637,7 +1562,7 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
int* d = (int*)data;
if (key < 0 || *d == 0) {
if(key < 0 || *d == 0){
value = 0;
position = NULL;
return false;
......@@ -1652,32 +1577,32 @@ bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
int tupleSize = sizeof(int) + sizeof(DTYPE);
char* p = (char*)data + headSize;
while (low <= high) {
int mid = low + (high - low) / 2;
while (low <= high){
int mid = low + (high - low)/2;
k = (int*)(p + tupleSize * mid);
if (*k == key) {
if(*k == key){
ok = true;
high = mid - 1;
break;
}
else if (*k > key) {
else if(*k > key){
high = mid - 1;
}
else {
else{
low = mid + 1;
last = mid;
}
}
if (ok) {
if(ok){
DTYPE* p = (DTYPE*)((char*)k + sizeof(int));
value = *p;
position = k;
return true;
}
else {
else{
value = 0;
if (last == -1)
if(last == -1)
position = NULL;
else
position = (char*)data + headSize + tupleSize * last;
......@@ -1730,10 +1655,10 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
if (label != NULL)
fprintf(file, "%s ", label);
if (isInit) {
if(isInit){
fprintf(file, "order=%d dimsize=", order);
if (order == 0) {
if(order == 0) {
fprintf(file, "%d,", dimSize[0]);
}
for (int i = 0; i < order; i++) {
......@@ -1742,21 +1667,21 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
fprintf(file, ",");
}
}
else {
else{
fprintf(file, "order=-1 dimsize=-1");
}
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
if (!isInit) {
if(!isInit){
fprintf(file, "NULL");
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if (i == beg)
if(i == beg)
fprintf(file, "%e", f);
else
fprintf(file, " %e", f);
......@@ -1765,9 +1690,9 @@ void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, co
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
for(int i = beg; i < end; i++){
int f = ((int*)d)[i];
if (i == beg)
if(i == beg)
fprintf(file, "%d", f);
else
fprintf(file, " %d", f);
......@@ -1858,7 +1783,7 @@ void XTensor::Read(FILE* file, const char* label)
fgetc(file);
if (fscanf(file, "order=%d dimsize=%s dtype=%s dense=%f",
&dimNum, dimSizeName, typeName, &dRatio) < 4) {
&dimNum, dimSizeName, typeName, &dRatio) < 4) {
ShowNTErrors("Incorrect format when reading the tensor!");
}
......@@ -2018,22 +1943,22 @@ allocate the memory space of the tensor (in the global memory)
*/
void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if (tensor == NULL)
if(tensor == NULL)
return;
if (myMem == NULL) {
if (tensor->data != NULL)
if(myMem == NULL){
if(tensor->data != NULL)
FreeData(tensor, NULL, false);
tensor->data = XMemAlloc(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
else {
else{
CheckNTErrors((tensor->data == NULL), "Cannot renew the space for the tensor");
if (useBuf) {
if(useBuf){
tensor->data = myMem->AllocBuf(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = false;
}
else {
else{
tensor->data = myMem->AllocGlobal(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
......@@ -2050,14 +1975,14 @@ free the memory space of the tensor (in the global memory)
*/
void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if (tensor == NULL)
if(tensor == NULL)
return;
if (myMem == NULL) {
if(myMem == NULL){
XMemFree(tensor->devID, tensor->data);
}
else {
if (tensor->isInGlobalMem)
else{
if(tensor->isInGlobalMem)
myMem->ReleaseGlobal(tensor->devID, tensor->data);
else
myMem->ReleaseBuf(tensor->devID, tensor->GetDataSizeInChar());
......
......@@ -303,6 +303,10 @@ public:
/* generate data items with a range by start, end and the step */
void Range(DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items with a fixed value */
template<class T>
void SetDataFixed(T num);
/* set tensor items by a uniform distribution */
void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);
......@@ -423,11 +427,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
void Dump(FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file = stderr, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a binary file */
void BinaryDump(FILE * file);
......
......@@ -116,7 +116,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
_SetDataFixedFloat(onehot, lowconfidence);
onehot->SetDataFixed(lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
......
......@@ -76,278 +76,191 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
//_SetDataRand(tensor, -finfout, finfout);
}
/*
generate data items with a fixed value p
>> tensor - the tensor whose data array would be initialized
>> p - pointer to the number for initializing the tensor
/*
set a data array with a fixed value
>> d - pointer to the data array
>> v - the initial value
>> size - size of the array
*/
void _SetDataFixed(XTensor * tensor, void * valuePointer)
template<class T>
void ArraySetDataFixed(T * d, T v, int size)
{
int num = tensor->unitNum;
if(tensor->dataType == X_INT){
int p = *(int*)valuePointer;
if(tensor->devID < 0){
int * d = (int*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedInt(tensor, p);
#endif
}
}
else if(tensor->dataType == X_FLOAT){
float p = *(float*)valuePointer;
if(tensor->devID < 0){
float * d = (float*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedFloat(tensor, p);
#endif
}
}
else if(tensor->dataType == X_DOUBLE){
double p = *(double*)valuePointer;
if(tensor->devID < 0){
double * d = (double*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedDouble(tensor, p);
#endif
if (size % 4 == 0) {
for (int i = 0; i < size; i += 4) {
d[i] = v;
d[i + 1] = v;
d[i + 2] = v;
d[i + 3] = v;
}
}
else{
ShowNTErrors("TODO");
else {
for (int i = 0; i < size; i++)
d[i] = v;
}
}
/*
generate data items with a fixed value p (in default type)
>> tensor - the tensor whose data array would be initialized
>> p - number in default type
*/
void SetDataFixed(XTensor &tensor, DTYPE p)
{
_SetDataFixed(&tensor, &p);
}
/*
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an integer
*/
void SetDataFixedInt(XTensor &tensor, int p)
{
CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
_SetDataFixed(&tensor, &p);
}
generate data items with a fixed value
/*
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an int-valued number
>> tensor - the tensor for initialization
>> value - the initial value
*/
void _SetDataFixedInt(XTensor * tensor, int p)
template<class T>
void _SetDataFixed(XTensor * tensor, T value)
{
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataFixed(tensor, value);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
if(p == 0)
tensor->SetZeroAll();
int num = tensor->unitNum;
if (tensor->dataType == X_INT)
ArraySetDataFixed((int*)tensor->data, (int)value, num);
else if (tensor->dataType == X_FLOAT)
ArraySetDataFixed((float*)tensor->data, (float)value, num);
else if (tensor->dataType == X_DOUBLE)
ArraySetDataFixed((double*)tensor->data, (double)value, num);
else
_SetDataFixed(tensor, &p);
ShowNTErrors("TODO! Unsupported datatype!")
}
template void _SetDataFixed<int>(XTensor*, int);
template void _SetDataFixed<float>(XTensor*, float);
template void _SetDataFixed<double>(XTensor*, double);
/*
generate data items with a fixed value p (in float)
>> tensor - the tensor whose data array would be initialized
>> p - a float-valued number
*/
void _SetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
}
generate data items with a fixed value p only if the condition entry is non-zero
/*
generate data items with a fixed value p (in double)
>> tensor - the tensor whose data array would be initialized
>> p - a double-valued number
>> d - pointer to the data array
>> c - pointer to the condition array
>> v - the initial value
>> size - size of the array
*/
void _SetDataFixedDouble(XTensor * tensor, double p)
template<class T>
void ArraySetDataFixedCond(T* d, T* c, T v, int size)
{
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
for (int i = 0; i < size; i++) {
if (c[i] != 0)
d[i] = v;
}
}
/*
generate data items with a fixed value p only if
the condition entry is non-zero
generate data items with a fixed value p only if the condition entry is non-zero
>> tensor - the tensor whose data array would be initialized
>> condition - the condition tensor whose entries would be checked
to set the corresponding entries in "tensor"
>> p - a given value
>> value - a given value
*/
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p)
template<class T>
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, T value)
{
int num = tensor->unitNum;
CheckDev(tensor->devID, condition->devID);
CheckDataType(tensor->dataType, condition->dataType);
CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
if(tensor->dataType == DEFAULT_DTYPE){
if(tensor->devID < 0){
DTYPE * data = (DTYPE*)tensor->data;
DTYPE * cond = (DTYPE*)condition->data;
for(int i = 0; i < num; i++){
if(cond[i] != 0)
data[i] = p;
}
}
else{
if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataFixedCondFloat(tensor, condition, p);
_CudaSetDataFixedCond(tensor, condition, value);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code");
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else{
ShowNTErrors("the tensor should be in integer typed!");
}
}
/*
generate data items with a fixed value p only if
the condition entry is non-zero
>> tensor - the tensor whose data array would be initialized
>> condition - the condition tensor whose entries would be checked
to set the corresponding entries in "tensor"
>> p - a given value
*/
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
{
int num = tensor->unitNum;
CheckNTErrors(num == condition->unitNum, "Wrong size of the condition tensor!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
if(tensor->dataType == DEFAULT_DTYPE){
if(tensor->devID < 0){
int * data = (int*)tensor->data;
int * cond = (int*)condition->data;
for(int i = 0; i < num; i++){
if(cond[i] != 0)
data[i] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataFixedCondInt(tensor, condition, p);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code");
#endif
}
}
else{
ShowNTErrors("TODO!");
}
if (tensor->dataType == X_INT)
ArraySetDataFixedCond((int*)tensor->data, (int*)condition->data, (int)value, num);
else if (tensor->dataType == X_FLOAT)
ArraySetDataFixedCond((float*)tensor->data, (float*)condition->data, (float)value, num);
else if (tensor->dataType == X_DOUBLE)
ArraySetDataFixedCond((double*)tensor->data, (double*)condition->data, (double)value, num);
else
ShowNTErrors("TODO! Unsupported datatype!")
}
template void _SetDataFixedCond<int>(XTensor*, XTensor*, int);
template void _SetDataFixedCond<float>(XTensor*, XTensor*, float);
template void _SetDataFixedCond<double>(XTensor*, XTensor*, double);
/*
set data items along with a given dimension (and keep the remaining items unchanged)
>> tensor - the tensor whose data array would be initialized
>> tensor - the tensor for initialization
>> beg - the beginning position
>> len - length along with the given dimension
>> dim - the dimension along which we set the data
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and value = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
>> value - the given value
*/
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
template<class T>
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T value)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
if(tensor->devID < 0){
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int order = tensor->order;
int size = tensor->GetDim(dim);
if (dim < 0)
dim = order + dim;
int l = len * stride;
CheckNTErrors(dim < order && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < size, "Illegal beginning position!");
CheckNTErrors(len >= 0 && beg + len <= size, "Illegal length!");
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + blockSize * i + beg * stride;
for(int j = 0; j < l; j++)
d[j] = p;
}
}
else{
if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataDim(tensor, beg, len, dim, p);
_CudaSetDataDim(tensor, beg, len, dim, (DTYPE)value);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = order - 1; i > dim; i--)
stride *= tensor->GetDim(i);
blockSize = stride * size;
blockNum = tensor->unitNum / blockSize;
int initNum = len * stride;
for(int i = 0; i < blockNum; i++) {
if (tensor->dataType == X_INT) {
int* d = (int*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (int)value;
}
else if (tensor->dataType == X_FLOAT) {
float* d = (float*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (float)value;
}
else if (tensor->dataType == X_DOUBLE) {
double* d = (double*)tensor->data + blockSize * i + beg * stride;
for (int j = 0; j < initNum; j++)
d[j] = (double)value;
}
else
ShowNTErrors("TODO! Unsupported datatype!")
}
}
template void _SetDataDim<int>(XTensor*, int, int, int, int);
template void _SetDataDim<float>(XTensor*, int, int, int, float);
template void _SetDataDim<double>(XTensor*, int, int, int, double);
/*
modify data items along with a given index and dimension (and keep the remaining items unchanged)
......@@ -355,115 +268,140 @@ modify data items along with a given index and dimension (and keep the remaining
>> modify - the tensor whose data array would be used to modify the source tensor
>> dim - the dimension along which we modify the tensor
>> index - index of the given dimension
e.g., given a source tensor (3, 3)
1 2 3
4 5 6
7 8 9
given a modified tensor (3)
1 2 3
when dim = 0, index = 1, we have
1 2 3
1 2 3
7 8 9
i.e., we set entries of row 1 to {1, 2, 3}
e.g., given a source tensor (3, 3)
1 2 3
4 5 6
7 8 9
given a modified tensor (3)
1 2 3
when dim = 0, index = 1, we have
1 2 3
1 2 3
7 8 9
i.e., we set entries of row 1 to {1, 2, 3}
*/
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _SetDataIndexed(XTensor * tensor, XTensor * modify, int dim, int index)
{
int order = source->order;
int size = source->GetDim(dim);
int order = tensor->order;
int size = tensor->GetDim(dim);
if (dim < 0)
dim = order + dim;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckDev(tensor->devID, modify->devID);
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
CheckNTErrors(index >= 0 && index < size, "Illegal index!");
for(int i = 0; i < order - 1; i++){
if(i < dim){
CheckNTErrors(modify->GetDim(i) == source->GetDim(i), "Illegal dimension!");
for(int i = 0; i < order - 1; i++) {
if(i < dim) {
CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i), "Illegal dimension!");
}
else if(i >= dim){
CheckNTErrors(modify->GetDim(i) == source->GetDim(i+1), "Illegal dimension!");
else if(i >= dim) {
CheckNTErrors(modify->GetDim(i) == tensor->GetDim(i+1), "Illegal dimension!");
}
}
if(source->devID < 0 && modify->devID < 0){
if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataIndexed(tensor, modify, dim, index);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
if(tensor->devID < 0) {
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = order - 1; i > dim; i--){
stride *= source->GetDim(i);
for (int i = order - 1; i > dim; i--) {
stride *= tensor->GetDim(i);
}
blockSize = stride * source->GetDim(dim);
blockNum = source->unitNum / blockSize;
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)source->data + blockSize * i + index * stride;
for (int i = 0; i < blockNum; i++) {
DTYPE * d = (DTYPE*)tensor->data + blockSize * i + index * stride;
DTYPE * p = (DTYPE*)modify->data + stride * i;
for(int j = 0; j < stride; j++)
d[j] = p[j];
}
}
else if(source->devID >= 0 && modify->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataIndexed(source, modify, dim, index);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
ShowNTErrors("TODO!");
}
}
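A small usage sketch matching the example in the comment above (names and sizes are illustrative; InitTensor, SetZeroAll, Range and _SetDataIndexed are used as they appear in this diff):
XTensor src, mod;
int srcDims[2] = {3, 3};
int modDims[1] = {3};
InitTensor(&src, 2, srcDims, X_FLOAT);
InitTensor(&mod, 1, modDims, X_FLOAT);
src.SetZeroAll(); /* start from an all-zero 3 * 3 tensor */
mod.Range(1.0F, 4.0F, 1.0F); /* mod = {1, 2, 3} */
_SetDataIndexed(&src, &mod, 0, 1); /* row 1 of src becomes {1, 2, 3} */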
/*
generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> value - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
e.g., for a 3 * 3 tensor,
when value = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when value = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
void _SetDataLowTri(XTensor * tensor, DTYPE value, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have a order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
if(tensor->devID < 0){
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + i * blockSize;
for(int row = 0; row < l; row++){
for(int col = 0; col <= row + shift; col++){
d[row * l + col] = p;
tensor->SetZeroAll();
if (tensor->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataLowTri(tensor, value, shift);
return;
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
int size = tensor->GetDim(-1);
int blockSize = size * size;
int blockNum = tensor->unitNum / blockSize;
for (int i = 0; i < blockNum; i++) {
for (int row = 0; row < size; row++) {
if (tensor->dataType == X_INT) {
int * d = (int*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (int)value;
}
for(int col = MAX(0, row + shift + 1); col < l; col++){
d[row * l + col] = 0;
/*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * size + col] = 0;
}*/
}
else if (tensor->dataType == X_FLOAT) {
float * d = (float*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (float)value;
}
/*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * size + col] = 0;
}*/
}
else if (tensor->dataType == X_DOUBLE) {
double * d = (double*)tensor->data + i * blockSize;
for (int col = 0; col <= row + shift; col++) {
d[row * size + col] = (double)value;
}
/*for (int col = MAX(0, row + shift + 1); col < size; col++) {
d[row * size + col] = 0;
}*/
}
else
ShowNTErrors("TODO! Unsupported datatype!")
}
}
else{
#ifdef USE_CUDA
_CudaSetDataLowTri(tensor, p, shift);
#endif
}
}
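For instance, a lower-triangular mask of the kind used for decoder self-attention could be built as follows (sizes are illustrative; note that the new code above zeroes the tensor first, so entries above the diagonal are left at 0):
XTensor mask;
int dims[2] = {5, 5};
InitTensor(&mask, 2, dims, X_FLOAT);
_SetDataLowTri(&mask, 1.0F, 0); /* ones on and below the diagonal, zeros above */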
/* generate data items with a uniform distribution in [0, 1] */
......@@ -484,7 +422,7 @@ generate data items with a uniform distribution in [lower, upper]
*/
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
CheckNTErrors(upper >= lower, "the high value must be greater than low value!");
if(tensor == NULL)
return;
......@@ -506,27 +444,50 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
}
else{
ShowNTErrors("TODO");
ShowNTErrors("TODO! Unsupported datatype!")
}
}
/*
GPU code
The trick here is to initialize the data in a temporary tensor on the CPU.
The CPU data is then copied to the GPU.
TODO: generate data points on GPUs straightforwardly.
*/
else{
#ifdef USE_CUDA
_CudaSetDataRand(tensor, lower, upper);
/*
GPU code
The trick here is to initialize the data in a temporary tensor on the CPU.
The CPU data is then copied to the GPU.
TODO: generate data points on GPUs straightforwardly.
*/
//_CudaSetDataRand(tensor, lower, upper);
int num = tensor->unitNum;
DTYPE variance = upper - lower;
void * d = NULL;
if (tensor->dataType == X_FLOAT) {
d = new float[num];
for (int i = 0; i < num; i++)
*((float*)d + i) = lower + variance * (float)rand() / RAND_MAX;
}
else if (tensor->dataType == X_DOUBLE) {
d = new double[num];
for (int i = 0; i < num; i++)
*((double*)d + i) = (double)lower + variance * rand() / RAND_MAX;
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
tensor->SetData(d, num);
if (tensor->dataType == X_FLOAT) {
delete[](float*)d;
}
else {
delete[](double*)d;
}
#endif
//XTensor * t2 = NewTensorV2(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//_SetDataRand(t2, low, high);
//_CopyValues(t2, tensor);
//delete t2;
}
}
/* generate data items with a range given by start, end and step
>> tensor - the tensor whose data array would be initialized
>> start - the beginning of the array
>> end - the end of the array (the end itself is not included)
......@@ -537,7 +498,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */
DTYPE size = fabs(upper - lower);
DTYPE size = (DTYPE)fabs(upper - lower);
int num = ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");
......@@ -554,7 +515,7 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
*((float*)data + i) = lower + i * step;
}
else {
ShowNTErrors("TODO!");
ShowNTErrors("TODO! Unsupported datatype!")
}
/* set the data from the array */
......@@ -564,8 +525,10 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
generate data items with a uniform distribution in [lower, upper] and
set the item to a pre-defined value if the item >= p,
set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
......@@ -595,9 +558,31 @@ void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE va
#endif // USE_CUDA
}
}
/* a gauss distribution (Box-Muller method) */
double GaussRand(DTYPE mean, DTYPE standardDeviation)
{
static double u, v;
static int phase = 0;
double z;
double pi = 3.141592654;
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
phase = 1 - phase;
return mean + (z * standardDeviation);
}
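For reference, the Box-Muller transform used above maps two independent uniform samples u, v in (0, 1] to a pair of standard normal samples; the phase flag returns the sine and cosine halves of the pair on alternating calls, and the result is scaled and shifted to the requested mean and standard deviation:
z_0 = \sqrt{-2\ln u}\,\sin(2\pi v), \qquad z_1 = \sqrt{-2\ln u}\,\cos(2\pi v), \qquad x = \mu + \sigma z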
/*
generate data items with a normal distribution with specified mean and standard deviation
>> tensor - the tensor that keeps the data
>> mean - mean or expectation of the distribution
>> standardDeviation - standard deviation of the distribution
......@@ -605,7 +590,31 @@ generate data items with a normal distribution with specified mean and standard
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
{
// TODO: rewrite it and add cuda code!!!!!!!
tensor->SetDataRandn(mean, standardDeviation);
int num = tensor->unitNum;
void * d = NULL;
if (tensor->dataType == X_FLOAT) {
d = new float[num];
for (int i = 0; i < num; i++)
*((float*)d + i) = (float)GaussRand(mean, standardDeviation);
}
else if (tensor->dataType == X_DOUBLE) {
d = new double[num];
for (int i = 0; i < num; i++)
*((double*)d + i) = GaussRand(mean, standardDeviation);
}
else {
ShowNTErrors("TODO! Unsupported datatype!")
}
tensor->SetData(d, num);
if (tensor->dataType == X_FLOAT) {
delete[](float*)d;
}
else {
delete[](double*)d;
}
}
/*
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
*/
#include <curand.h>
#include <time.h>
......@@ -32,110 +32,35 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set an integer data array with a fixed value p (in int)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedInt(int * d, int size, int p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in int)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void _CudaSetDataFixedInt(XTensor * tensor, int p)
{
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a data array with a fixed value
set a float data array with a fixed value p (in float)
>> d - pointer to the data array
>> v - the initial value
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedFloat(float * d, int size, float p)
template<class T>
__global__
void KernelSetDataFixed(T * d, T v, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in float)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void _CudaSetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
d[i] = v;
}
template __global__ void KernelSetDataFixed<int>(int *, int, int);
template __global__ void KernelSetDataFixed<float>(float *, float, int);
template __global__ void KernelSetDataFixed<double>(double *, double, int);
/*
set a double data array with a fixed value p (in double)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedDouble(double * d, int size, double p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in double)
generate data items with a fixed value
>> tensor - the tensor for initialization
>> p - the initial value
>> value - the initial value
*/
void _CudaSetDataFixedDouble(XTensor * tensor, double p)
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T value)
{
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
int gridSize[3];
int blockSize[3];
......@@ -145,59 +70,23 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p)
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a float data array with a fixed value p (in float) only
if the condition entry is non-zero
>> d - pointer to the data array
>> c - pointer to the condition array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedCondFloat(float * d, float * c, int size, float p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size && c[i] != 0)
d[i] = p;
}
/*
generate data items with a fixed value p (in float) only
if the condition entry is non-zero
>> tensor - the tensor for initialization
>> condition - the condition tensor whose entry would be checked to
set the corresponding entry in "tensor"
>> p - the initial value
*/
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedCondFloat <<<blocks, threads >>>((float*)tensor->data, (float*)condition->data,
tensor->unitNum, p);
if (tensor->dataType == X_INT)
KernelSetDataFixed << <blocks, threads >> > ((int*)tensor->data, (int)value, tensor->unitNum);
else if (tensor->dataType == X_FLOAT)
KernelSetDataFixed << <blocks, threads >> > ((float*)tensor->data, (float)value, tensor->unitNum);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataFixed << <blocks, threads >> > ((double*)tensor->data, (double)value, tensor->unitNum);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataFixed<int>(XTensor *, int);
template void _CudaSetDataFixed<float>(XTensor *, float);
template void _CudaSetDataFixed<double>(XTensor *, double);
/*
set a data array with a fixed value p only
......@@ -207,28 +96,30 @@ if the condition entry is non-zero
>> size - size of the array
>> p - the initial value
*/
template<class T>
__global__
void KernelSetDataFixedCondInt(int * d, float * c, int size, int p)
void KernelSetDataFixedCond(T * d, T * c, T value, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size && c[i] != 0)
d[i] = p;
d[i] = value;
}
template __global__ void KernelSetDataFixedCond<int>(int*, int*, int, int);
template __global__ void KernelSetDataFixedCond<float>(float*, float*, float, int);
template __global__ void KernelSetDataFixedCond<double>(double*, double*, double, int);
/*
generate data items with a fixed value p (in int) only
if the condition entry is non-zero
generate data items with a fixed value p
only if the condition entry is non-zero
>> tensor - the tensor for initialization
>> condition - the condition tensor whose entry would be checked to
set the corresponding entry in "tensor"
>> p - the initial value
>> value - the initial value
*/
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
template<class T>
void _CudaSetDataFixedCond(XTensor* tensor, XTensor* condition, T value)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
CheckNTErrors(condition->unitSize == sizeof(float), "TODO!");
int gridSize[3];
int blockSize[3];
......@@ -240,11 +131,24 @@ void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedCondInt <<<blocks, threads >>>((int*)tensor->data, (float*)condition->data,
tensor->unitNum, p);
if (tensor->dataType == X_INT)
KernelSetDataFixedCond <<< blocks, threads >>> ((int*)tensor->data, (int*)condition->data,
(int)value, tensor->unitNum);
else if (tensor->dataType == X_FLOAT)
KernelSetDataFixedCond <<< blocks, threads >>> ((float*)tensor->data, (float*)condition->data,
(float)value, tensor->unitNum);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataFixedCond <<< blocks, threads >>> ((double*)tensor->data, (double*)condition->data,
(double)value, tensor->unitNum);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataFixedCond<int>(XTensor*, XTensor*, int);
template void _CudaSetDataFixedCond<float>(XTensor*, XTensor*, float);
template void _CudaSetDataFixedCond<double>(XTensor*, XTensor*, double);
/*
set data array with a uniform distribution in [low, high]
......@@ -309,8 +213,9 @@ set data items along with a given dimension (and keep the remaining items unchan
>> blockSize - size of a data block
>> blockNum - number of data blocks
*/
template<class T>
__global__
void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p)
void KernelSetDataDim(T * d, int beg, int len, int blockSize, int blockNum, T p)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -326,6 +231,9 @@ void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum,
d[blockSize * j + i] = p;
}
template __global__ void KernelSetDataDim<int>(int*, int, int, int, int, int);
template __global__ void KernelSetDataDim<float>(float*, int, int, int, int, float);
template __global__ void KernelSetDataDim<double>(double*, int, int, int, int, double);
/*
set data items along with a given dimension (and keep the remaining items unchanged) - cuda version
......@@ -343,7 +251,8 @@ e.g., given a 3 * 3 tensor
7 8 9
i.e., we set all entries of row 1 to 0
*/
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
template<class T>
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p)
{
int n = tensor->order;
......@@ -372,11 +281,24 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, p);
if (tensor->dataType == X_INT)
KernelSetDataDim << <blocks, threads >> > ((int*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (int)p);
else if (tensor->dataType == X_FLOAT)
KernelSetDataDim << <blocks, threads >> > ((float*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (float)p);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataDim << <blocks, threads >> > ((double*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, (double)p);
else
ShowNTErrors("TODO! Unsupported datatype!")
BacktoCudaDev(tensor->devID, devIDBackup);
}
template void _CudaSetDataDim<int>(XTensor*, int, int, int, int);
template void _CudaSetDataDim<float>(XTensor*, int, int, int, float);
template void _CudaSetDataDim<double>(XTensor*, int, int, int, double);
/*
modify data items along with a given index and dimension
......@@ -462,6 +384,7 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
/*
set lower triangular matrices for each block
>> d - pointer to the data array
>> l - row number (or column number) of each block, i.e.,
a block is an l * l matrix
......@@ -469,15 +392,15 @@ set lower triangular matrics for each block
>> blockNum - number of the blocks
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
__global__
void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
......@@ -501,35 +424,28 @@ void KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p,
*d2 = 0;
}
/*
generate data as lower triangular matrices for last two dimensions (cuda version)
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> value - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
e.g., for a 3 * 3 tensor,
when value = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when value = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have a order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
int size = tensor->GetDim(-1);
int blockSize = size * size;
int blockNum = tensor->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
......@@ -542,7 +458,7 @@ void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, size, blockSize, blockNum, value, shift);
BacktoCudaDev(tensor->devID, devIDBackup);
}
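A call sketch for the first example in the comment above (value = 1, shift = 0) on a GPU tensor; the shape and device are illustrative.
int dims[2] = {3, 3};
XTensor * m = NewTensorV2(2, dims, X_FLOAT, 1.0F, 0);
_CudaSetDataLowTri(m, 1.0F, 0);   /* lower-triangular entries become 1, all others 0 */
DelTensor(m);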
......
......@@ -28,31 +28,24 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int) */
void _CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value */
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T value);
/* generate data items with a fixed value p (in float) */
void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p (in float) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
/* generate data items with a fixed value p (in int) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* generate data items with a fixed value p
only if the condition entry is non-zero */
template<class T>
void _CudaSetDataFixedCond(XTensor * tensor, XTensor * condition, T p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
template<class T>
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, T p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions (cuda version) */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
void _CudaSetDataLowTri(XTensor * tensor, DTYPE value, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
......
......@@ -30,32 +30,17 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
/* generate data items with a fixed value p */
void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value */
template<class T>
void _SetDataFixed(XTensor * tensor, T value);
/* generate data items with a fixed value p (in default type) */
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in integer) */
void SetDataFixedInt(XTensor &tensor, int p);
/* generate data items with a fixed value p (in int) */
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* generate data items with a fixed value only if the condition entry is non-zero */
template<class T>
void _SetDataFixedCond(XTensor* tensor, XTensor* condition, T value);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
template<class T>
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, T p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
......
......@@ -70,7 +70,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)
InitTensor1DV2(&c, x.unitNum, x.dataType, x.devID, x.mem);
_SetDataFixedFloat(&c, 1.0F);
c.SetDataFixed(1.0);
_DropoutWithIndex(&x, &maskIndex, &c);
......
......@@ -383,15 +383,7 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
int leadDim, int tBeg, int tLen, int yBeg)
{
if(t == NULL){
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else{
ShowNTErrors("TODO");
}
dedy->SetDataFixed(1);
return;
}
......
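The remaining hunks in this commit are mechanical call-site updates following one pattern: the type-specific free functions are replaced by the templated member setter. A sketch of the pattern, assuming XTensor::SetDataFixed casts its argument to the tensor's own data type (x and dedy are the tensors used in the surrounding hunks):
/* before: one helper per data type */
_SetDataFixedFloat(x, 1.0F);
_SetDataFixedDouble(dedy, 1.0);
/* after: a single member call for any supported data type */
x->SetDataFixed(1.0F);
dedy->SetDataFixed(1.0);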
......@@ -50,7 +50,7 @@ bool TestDropout1()
XTensor yUser;
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
x->SetDataFixed(1);
y->SetZeroAll();
/* call Dropout function */
......@@ -88,7 +88,7 @@ bool TestDropout1()
XTensor yUserGPU;
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
xGPU->SetDataFixed(1);
yGPU->SetZeroAll();
/* call Dropout function */
......@@ -157,10 +157,10 @@ bool TestDropout2()
XTensor * dedy = NewTensorV2(order, dimSize);
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
x->SetDataFixed(1.0);
y->SetZeroAll();
dedx->SetZeroAll();
_SetDataFixedFloat(dedy, 1.5F);
dedy->SetDataFixed(1.5);
/* call Dropout function */
float dropProb = 0.5F;
......@@ -183,10 +183,10 @@ bool TestDropout2()
XTensor * dedyGPU = NewTensorV2(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
xGPU->SetDataFixed(1.0);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
_SetDataFixedFloat(dedyGPU, 1.5F);
dedyGPU->SetDataFixed(1.5);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, dropProb);
......
......@@ -195,8 +195,8 @@ bool TestReduceSum2()
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
s->SetDataFixed(1);
answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
......@@ -215,7 +215,7 @@ bool TestReduceSum2()
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
......@@ -284,8 +284,8 @@ bool TestReduceSum3()
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
s->SetDataFixed(1);
answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
......@@ -304,7 +304,7 @@ bool TestReduceSum3()
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
......@@ -373,8 +373,8 @@ bool TestReduceSum4()
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
s->SetDataFixed(1);
answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
......@@ -393,7 +393,7 @@ bool TestReduceSum4()
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
......@@ -464,8 +464,8 @@ bool TestReduceSum5()
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
s->SetDataFixed(1);
answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
......@@ -484,7 +484,7 @@ bool TestReduceSum5()
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
......@@ -556,8 +556,8 @@ bool TestReduceSum6()
XTensor tUser;
/* initialize variables */
_SetDataFixedFloat(s, 1.0F);
_SetDataFixedFloat(answer, (float)s->GetDim(1));
s->SetDataFixed(1);
answer->SetDataFixed(s->GetDim(1));
/* call ReduceSum function */
_ReduceSum(s, t, 1);
......@@ -576,7 +576,7 @@ bool TestReduceSum6()
XTensor tUserGPU;
/* initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU, 1);
......
......@@ -119,7 +119,7 @@ bool TestSetData2()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 1.0F);
s->SetDataFixed(1);
modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
......@@ -137,7 +137,7 @@ bool TestSetData2()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
......@@ -212,11 +212,11 @@ bool TestSetData3()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 1.0F);
s->SetDataFixed(1);
modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
_SetDataFixedFloat(s, 1.0F);
s->SetDataFixed(1);
_SetDataIndexed(s, modify, 1, 1);
/* check results */
......@@ -231,7 +231,7 @@ bool TestSetData3()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
sGPU->SetDataFixed(1);
modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
......
......@@ -91,7 +91,7 @@ bool TestSpread1()
XTensor * modify = NewTensorV2(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 0.0F);
s->SetZeroAll();
modify->SetData(data, dataUnitNum);
/* call _Spread function */
......@@ -109,7 +109,7 @@ bool TestSpread1()
XTensor * modifyGPU = NewTensorV2(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 0.0F);
sGPU->SetZeroAll();
modifyGPU->SetData(data, dataUnitNum);
/* call _Spread function */
......
......@@ -296,8 +296,8 @@ bool TestSumDim3()
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
b->SetDataFixed(1);
answer->SetDataFixed(1);
/* call SumDim function */
_SumDim(a, b, c, 1);
......@@ -323,7 +323,7 @@ bool TestSumDim3()
/* Initialize variables */
aGPU->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
bGPU->SetDataFixed(1);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
......@@ -405,8 +405,8 @@ bool TestSumDim4()
/* initialize variables */
a->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(b, 1.0F);
_SetDataFixedFloat(answer, 1.0F);
b->SetDataFixed(1);
answer->SetDataFixed(1);
/* call SumDim function */
_SumDim(a, b, c, 1);
......@@ -432,7 +432,7 @@ bool TestSumDim4()
/* Initialize variables */
aGPU->SetZeroAll();
cMe->SetZeroAll();
_SetDataFixedFloat(bGPU, 1.0F);
bGPU->SetDataFixed(1);
/* call sum function */
_SumDim(aGPU, bGPU, cGPU, 1);
......