Commit a0aa3d49 by xuchen

Merge with the huchi branch; fix the binary-operation bug (missing link creation) and modify the FNNLM interface.

parent b30fad5f
...@@ -530,7 +530,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient) ...@@ -530,7 +530,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * dedc = node->grad; XTensor * dedc = node->grad;
XTensor * deda = a->grad; XTensor * deda = a->grad;
XTensor * dedb = b->grad; XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2) if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient); GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){ else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
...@@ -584,9 +584,19 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra ...@@ -584,9 +584,19 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
if(!isEfficient || a->isGrad) if(!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F); _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
//if (b->id == 15)
// dedb->Dump(stdout, "dedb", 100);
/* dE/db = a^T * dE/dc * \alpha */ /* dE/db = a^T * dE/dc * \alpha */
if(!isEfficient || b->isGrad) if(!isEfficient || b->isGrad)
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F); _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
//
//if (b->id == 15) {
// a->Dump(stdout, "a", 100);
// dedc->Dump(stdout, "dedc", 100);
// dedb->Dump(stdout, "dedb", 100);
// exit(1);
//}
} }
/* c = a^T * b * \alpha */ /* c = a^T * b * \alpha */
...@@ -1578,10 +1588,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient) ...@@ -1578,10 +1588,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize); node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad); DelTensorBuf(interGrad);
} }
/* compute dE/dx, dE/dw */ /* compute dE/dx, dE/dw */
XTensor * c = node; XTensor * c = node;
XTensor * dedc = node->grad; XTensor * dedc = node->grad;
......
...@@ -271,6 +271,11 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent) ...@@ -271,6 +271,11 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
else{ else{
ShowNTErrors("Wrong node type!"); ShowNTErrors("Wrong node type!");
} }
//FILE *f = fopen("debug", "a");
//node->Dump(f, "node", 10);
//if (node->grad != NULL)
// node->grad->Dump(f, "node->grad", 10);
} }
else{ else{
node->visitMark = NODE_FINISHED; node->visitMark = NODE_FINISHED;
......
...@@ -586,9 +586,6 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad) ...@@ -586,9 +586,6 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
XTensor * para = (XTensor*)paraList.GetItem(i); XTensor * para = (XTensor*)paraList.GetItem(i);
XTensor * paraGrad = (XTensor*)gradList.GetItem(i); XTensor * paraGrad = (XTensor*)gradList.GetItem(i);
//fprintf(stderr, "%d\n", i);
//paraGrad->Dump(stderr, "grad:", 10);
/* the delta rule */ /* the delta rule */
_Sum(para, paraGrad, para, -epsilon); _Sum(para, paraGrad, para, -epsilon);
} }
...@@ -607,14 +604,14 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs) ...@@ -607,14 +604,14 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
InitTensorV2(&probs, &output); InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs); Multiply(output, gold, probs);
/* probability of each word */ /* probability of each word */
XTensor wprobs; XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID); InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1); ReduceSum(probs, wprobs, 1);
if(wordProbs != NULL) if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs); CopyValues(wprobs, *wordProbs);
/* reshape the tensor to fit it into the reduce procedure /* reshape the tensor to fit it into the reduce procedure
TODO: XTensor supports scalars */ TODO: XTensor supports scalars */
...@@ -626,7 +623,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs) ...@@ -626,7 +623,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */ /* probability for the batch */
XTensor result; XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID); InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1); ReduceSum(probs, result, 1);
return result.Get1D(0); return result.Get1D(0);
} }
...@@ -793,7 +790,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -793,7 +790,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate word embedding of position i: /* generate word embedding of position i:
embedding = input * w */ embedding = input * w */
_MatrixMul(&input, X_NOTRANS, &w, X_NOTRANS, &embedding); MatrixMul(input, X_NOTRANS, w, X_NOTRANS, embedding);
eList.Add(&net.embeddings[i]); eList.Add(&net.embeddings[i]);
} }
...@@ -801,7 +798,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -801,7 +798,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* concatenate word embeddings /* concatenate word embeddings
embeddingcat = cat(embedding_0...embedding_{n-1}) */ embeddingcat = cat(embedding_0...embedding_{n-1}) */
InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model); InitModelTensor2D(net.embeddingCat, batchSize, (n - 1) * model.eSize, model);
_Concatenate(&eList, &net.embeddingCat, 1); Concatenate(eList, net.embeddingCat, 1);
/* go over each hidden layer */ /* go over each hidden layer */
for(int i = 0; i < depth; i++){ for(int i = 0; i < depth; i++){
...@@ -816,22 +813,22 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -816,22 +813,22 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* generate hidden states of layer i: /* generate hidden states of layer i:
s = h_pre * w */ s = h_pre * w */
_MatrixMul(&h_pre, X_NOTRANS, &w, X_NOTRANS, &s); MatrixMul(h_pre, X_NOTRANS, w, X_NOTRANS, s);
/* make a 2d tensor for the bias term */ /* make a 2d tensor for the bias term */
XTensor b2D; XTensor b2D;
InitTensorV2(&b2D, &s); InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize); Unsqueeze(b, b2D, 0, batchSize);
/* introduce bias term: /* introduce bias term:
s = s + b s = s + b
NOTE: the trick here is to extend b to a 2d tensor NOTE: the trick here is to extend b to a 2d tensor
to fit into the 2d representation in tensor summation */ to fit into the 2d representation in tensor summation */
_Sum(&s, &b2D, &s); Sum(s, b2D, s);
/* pass the state through the hard tanh function: /* pass the state through the hard tanh function:
h = tanh(s) */ h = tanh(s) */
_HardTanH(&s, &h); HardTanH(s, h);
} }
/* generate the output Pr(w_{n-1}|w_0...w_{n-2}): /* generate the output Pr(w_{n-1}|w_0...w_{n-2}):
...@@ -849,16 +846,16 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net) ...@@ -849,16 +846,16 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(y, batchSize, model.vSize, model); InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */ /* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s); MatrixMul(h_last, X_NOTRANS, w, X_NOTRANS, s);
XTensor b2D; XTensor b2D;
InitTensorV2(&b2D, &s); InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize); Unsqueeze(b, b2D, 0, batchSize);
_Sum(&s, &b2D, &s); Sum(s, b2D, s);
/* y = softmax(s) */ /* y = softmax(s) */
_LogSoftmax(&s, &y, 1); LogSoftmax(s, y, 1);
} }
} }
...@@ -900,18 +897,18 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA ...@@ -900,18 +897,18 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
x is the top most hidden layer) x is the top most hidden layer)
so we know so we know
dE/dw = x^T * dE/ds */ dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw); MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */ specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0); ReduceSum(deds, dedb, 0);
/* then, we compute /* then, we compute
dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j) dE/dx_{j} = \sum_j' (dE/ds_{j'} * ds_{j'}/dx_j)
= \sum_j' (dE/ds_{j'} * w_{j, j'}) = \sum_j' (dE/ds_{j'} * w_{j, j'})
i.e., i.e.,
dE/dx = dE/ds * w^T */ dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx); MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
XTensor &gradPassed = dedx; XTensor &gradPassed = dedx;
XTensor dedsHidden; XTensor dedsHidden;
...@@ -939,17 +936,17 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA ...@@ -939,17 +936,17 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
_HardTanHBackward(&h, &s, &dedh, &deds); _HardTanHBackward(&h, &s, &dedh, &deds);
/* gradient of the weight: dE/dw = x^T * dE/ds */ /* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw); MatrixMul(x, X_TRANS, deds, X_NOTRANS, dedw);
/* gradient of the bias: dE/db = dE/ds * 1 = dE/ds /* gradient of the bias: dE/db = dE/ds * 1 = dE/ds
specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */ specifically dE/db_{j} = \sum_{i} dE/ds_{i,j} */
_ReduceSum(&deds, &dedb, 0); ReduceSum(deds, dedb, 0);
/* gradient of the input: dE/dx = dE/ds * w^T */ /* gradient of the input: dE/dx = dE/ds * w^T */
_MatrixMul(&deds, X_NOTRANS, &w, X_TRANS, &dedx); MatrixMul(deds, X_NOTRANS, w, X_TRANS, dedx);
if (i > 0) if (i > 0)
_CopyValues(&dedx, &gradPassed); CopyValues(dedx, gradPassed);
} }
TensorList eList(n - 1); TensorList eList(n - 1);
...@@ -964,7 +961,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA ...@@ -964,7 +961,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor &dedyCat = depth > 0 ? dedxBottom : dedx; XTensor &dedyCat = depth > 0 ? dedxBottom : dedx;
/* split the concatenation of gradients of the embeddings */ /* split the concatenation of gradients of the embeddings */
_Split(&dedyCat, &eList, 1, n - 1); Split(dedyCat, eList, 1, n - 1);
/* go over for each word */ /* go over for each word */
for (int i = 0; i < n - 1; i++) { for (int i = 0; i < n - 1; i++) {
...@@ -975,7 +972,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA ...@@ -975,7 +972,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* gradient of the embedding weight: dE/dw += x^T * dE/dy /* gradient of the embedding weight: dE/dw += x^T * dE/dy
NOTE that we accumulate dE/dw here because the matrix w NOTE that we accumulate dE/dw here because the matrix w
is shared by several layers (or words) */ is shared by several layers (or words) */
_MatrixMul(&x, X_TRANS, dedy, X_NOTRANS, &dedw, 1.0F, 1.0F); MatrixMul(x, X_TRANS, *dedy, X_NOTRANS, dedw, 1.0F, 1.0F);
delete dedy; delete dedy;
} }
......
...@@ -737,6 +737,11 @@ void XLink::ShowNode(FILE * file, XTensor * node) ...@@ -737,6 +737,11 @@ void XLink::ShowNode(FILE * file, XTensor * node)
} }
} }
fprintf(file, "shape[%d] ", node->order);
for (int i = 0; i < node->order; i++)
fprintf(file, "%d ", node->GetDim(i));
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
......
...@@ -59,6 +59,8 @@ const char * GetOPName(int type) ...@@ -59,6 +59,8 @@ const char * GetOPName(int type)
return "M_DIV"; return "M_DIV";
else if (type == MATH_DIVDIM) else if (type == MATH_DIVDIM)
return "M_DIVDIM"; return "M_DIVDIM";
else if (type == MATH_MASK)
return "M_MASK";
else if (type == MATH_MATRIXMUL) else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL"; return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED) else if (type == MATH_MATRIXMULBATCHED)
......
...@@ -48,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -48,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1 #define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1 #define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1 #define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1 #define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1 #define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1 #define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
......
...@@ -1826,6 +1826,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1826,6 +1826,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, "%s ", label); fprintf(file, "%s ", label);
if(isInit){ if(isInit){
fprintf(file, "id=%d ", id);
fprintf(file, "order=%d dimsize=", order); fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]); fprintf(file, "%d", dimSize[i]);
...@@ -1878,7 +1879,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1878,7 +1879,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, "[%d]%e ", key, value); fprintf(file, "[%d]%e ", key, value);
} }
} }
fprintf(file, "\n"); fprintf(file, "\n\n");
if (isNewData) { if (isNewData) {
delete[](char*)d; delete[](char*)d;
......
...@@ -142,6 +142,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -142,6 +142,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim); _Div(a, b, a, alpha, leadingDim);
} }
/*
element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where we keep the result)
>> b - tensor b
>> alpha - the coefficient applied to the original value of a
>> leadingDim - the dimension along which we perform broadcasting
*/
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Div(&a, &b, &a, alpha, leadingDim);
}
/* /*
return a dimension if the division is performed as DivDim (in more details in DivDim.h) return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor >> a - a tensor
...@@ -229,9 +246,8 @@ where i is the index of the item ...@@ -229,9 +246,8 @@ where i is the index of the item
>> c - result tensor >> c - result tensor
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/ */
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink) void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -245,7 +261,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin ...@@ -245,7 +261,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _Div function */ /* call _Div function */
_Div(&a, &b, &c, 0, leadingDim); _Div(&a, &b, &c, 0, leadingDim);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, alpha);
...@@ -256,7 +272,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin ...@@ -256,7 +272,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _DivDim function */ /* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha); _DivDim(&a, &b, &c, n, alpha);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -122,7 +122,7 @@ where i is the item index ...@@ -122,7 +122,7 @@ where i is the item index
*/ */
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim) void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{ {
int leadingDimRDI = a->order - leadingDim - 1; int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum), CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!"); "Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!"); CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......
...@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i) ...@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element where i is the index of the element
*/ */
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0); void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise division of two tensors (return an XTensor structure) element-wise division of two tensors (return an XTensor structure)
...@@ -54,7 +55,7 @@ element-wise division of two tensors: ...@@ -54,7 +55,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i) c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element where i is the index of the element
*/ */
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false); void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -183,9 +183,8 @@ i.e., a is divided with b by broadcasting ...@@ -183,9 +183,8 @@ i.e., a is divided with b by broadcasting
>> c - where we put result. we save it in a if c is NULL >> c - where we put result. we save it in a if c is NULL
>> n - the dimension index >> n - the dimension index
>> alpha - the scaling factor >> alpha - the scaling factor
>> requireLink - if add operation to network
*/ */
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink) void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -194,7 +193,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, ...@@ -194,7 +193,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha,
/* call _Div function */ /* call _Div function */
_DivDim(&a, &b, &c, n, alpha); _DivDim(&a, &b, &c, n, alpha);
if (requireLink) { if (c.enableGrad == true) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM); XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -59,7 +59,7 @@ c(i) = a/b + \alpha * c ...@@ -59,7 +59,7 @@ c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a, where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting i.e., a is divided with b by broadcasting
*/ */
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false); void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha) ...@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
} }
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
>> a - the tensor to be masked (where we keep the result)
>> mask - the mask tensor; zero entries select positions to overwrite
>> alpha - the value written at masked (zero-mask) positions
*/
void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
{
_Mask(&a, &mask, &a, alpha);
}
/*
mask entries of a given tensor (return an XTensor structure): mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0 a(i) = alpha if mask(i) = 0
...@@ -140,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha) ...@@ -140,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
XTensor c(&a); XTensor c(&a);
c.SetTMPFlag(); c.SetTMPFlag();
/* call _Sum function */ /* call _Mask function */
_Mask(&a, &mask, &c, alpha); _Mask(&a, &mask, &c, alpha);
/* tensor connections */ /* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM); XLink::MakeLink(&a, &mask, &c, MATH_MASK);
//XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c; return c;
} }
/*
mask entries of a given tensor (keep the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
>> a - the input tensor
>> mask - the mask tensor; zero entries select positions to overwrite
>> c - the output tensor (initialized from a if not yet initialized or mis-shaped)
>> alpha - the value written at masked (zero-mask) positions
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* record the operation in the network only when gradients are enabled for c */
if (c.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
}
} }
\ No newline at end of file
...@@ -43,6 +43,7 @@ a(i) = alpha if mask(i) = 0 ...@@ -43,6 +43,7 @@ a(i) = alpha if mask(i) = 0
where i is the index of the element where i is the index of the element
*/ */
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0); void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha = 0.0);
/* /*
mask entries of a given tensor (return an XTensor structure): mask entries of a given tensor (return an XTensor structure):
...@@ -52,6 +53,14 @@ where i is the index of the element ...@@ -52,6 +53,14 @@ where i is the index of the element
*/ */
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0); XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (keep the result in the output tensor c):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__ #endif // __MASK_H__
...@@ -202,7 +202,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -202,7 +202,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList; delete cList;
} }
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c) bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c)
{ {
if (!(a && b && c)) if (!(a && b && c))
return false; return false;
...@@ -231,10 +233,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen ...@@ -231,10 +233,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen
dimSize[sub++] = bm; dimSize[sub++] = bm;
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i]) if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false; return false;
}
} }
delete[] dimSize;
return true; return true;
} }
...@@ -303,8 +308,8 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, ...@@ -303,8 +308,8 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
} }
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink) DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{ {
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!"); CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!"); CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
...@@ -337,9 +342,9 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, ...@@ -337,9 +342,9 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
} }
/* call _MatrixMul function */ /* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner); _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL); XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA); XLink::AddParamToHeadTrans(&c, transposedA);
...@@ -400,7 +405,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, ...@@ -400,7 +405,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
} }
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c, void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink) DTYPE alpha, XPRunner * parallelRunner)
{ {
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!"); CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!"); CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
...@@ -435,7 +440,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c, ...@@ -435,7 +440,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
/* call _MatrixMul function */ /* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner); _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL); XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS); XLink::AddParamToHeadTrans(&c, X_NOTRANS);
......
...@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th ...@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m. C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y. Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/ */
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/* /*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
...@@ -56,19 +59,23 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th ...@@ -56,19 +59,23 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m. C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y. Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/ */
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0,
XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false); const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/ /* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c, void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -143,6 +143,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -143,6 +143,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim); _Multiply(a, b, a, alpha, leadingDim);
} }
/*
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where we keep the result)
>> b - tensor b
>> alpha - the coefficient applied to the original value of a
>> leadingDim - the dimension along which we perform broadcasting
*/
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Multiply(&a, &b, &a, alpha, leadingDim);
}
/* /*
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h) return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
>> a - a tensor >> a - a tensor
...@@ -230,9 +247,8 @@ where i is the index of the item ...@@ -230,9 +247,8 @@ where i is the index of the item
>> c - result tensor >> c - result tensor
>> alpha - the coefficient >> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/ */
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink) void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -246,7 +262,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -246,7 +262,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim); _Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, alpha);
...@@ -257,7 +273,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l ...@@ -257,7 +273,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _MultiplyDim function */ /* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha); _MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -122,8 +122,8 @@ where i is the item index ...@@ -122,8 +122,8 @@ where i is the item index
*/ */
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim) void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{ {
int leadingDimRDI = a->order - leadingDim - 1; int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum), CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
"Unmatched tensors in multiplication!"); "Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!"); CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......
...@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i) ...@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element where i is the index of the element
*/ */
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0); void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/* /*
element-wise product of two tensors (return an XTensor structure) element-wise product of two tensors (return an XTensor structure)
...@@ -54,7 +55,7 @@ element-wise product of two tensors: ...@@ -54,7 +55,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element where i is the index of the element
*/ */
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false); void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha) ...@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
} }
/* /*
tensor multiplication(do it on site)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
{
_MultiplyDim(&a, &b, &a, n, alpha);
}
/*
tensor multiplication (return an XTensor structure and make tensor connections) tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -180,9 +198,8 @@ i.e., a is multiplied with b by broadcasting ...@@ -180,9 +198,8 @@ i.e., a is multiplied with b by broadcasting
>> b - another tensor whose size is equal to that of dimension n of a >> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL >> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index >> n - the dimension index
>> requireLink - if add operation to network
*/ */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink) void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -191,7 +208,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool req ...@@ -191,7 +208,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool req
/* call _Multiply function */ /* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0); _MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
...@@ -347,9 +364,8 @@ where some of dimensions of b can be of size 1 ...@@ -347,9 +364,8 @@ where some of dimensions of b can be of size 1
>> a - a tensor >> a - a tensor
>> b - another tensor that would be broadcasted >> b - another tensor that would be broadcasted
>> c - the resulting tensor >> c - the resulting tensor
>> requireLink - if add operation to network
*/ */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink) void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -358,7 +374,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requ ...@@ -358,7 +374,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requ
/* call _SumBroadcast function */ /* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0); _MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0); XLink::AddParamToHead(&c, 0);
......
...@@ -126,6 +126,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -126,6 +126,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
{ {
_Sub(a, b, a, beta); _Sub(a, b, a, beta);
} }
/*
tensor subtraction a = a - b * \beta (do it on site)
keep the result in the tensor a and return nothing
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
*/
void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
{
_Sub(&a, &b, &a, beta);
}
/* /*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h) return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
...@@ -203,9 +216,8 @@ tensor subtraction c = a - b * \beta ...@@ -203,9 +216,8 @@ tensor subtraction c = a - b * \beta
>> b - another tensor >> b - another tensor
>> c - where we put a-b*\beta. we save it in a if c is NULL >> c - where we put a-b*\beta. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
>> requireLink - if add operation to network
*/ */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink) void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -217,7 +229,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir ...@@ -217,7 +229,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _Sub function */ /* call _Sub function */
_Sub(&a, &b, &c, beta); _Sub(&a, &b, &c, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB); XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta); XLink::AddParamToHead(&c, beta);
...@@ -227,7 +239,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir ...@@ -227,7 +239,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _SubDim function */ /* call _SubDim function */
_SubDim(&a, &b, &c, n, beta); _SubDim(&a, &b, &c, n, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM); XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta ...@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
*/ */
void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0); void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
/* /*
tensor subtraction c = a - b * \beta tensor subtraction c = a - b * \beta
...@@ -43,7 +44,7 @@ make a new tensor c to keep the result and return it ...@@ -43,7 +44,7 @@ make a new tensor c to keep the result and return it
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */ /* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false); void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -40,7 +40,7 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1. ...@@ -40,7 +40,7 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a, /* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/ i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false); void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -132,6 +132,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -132,6 +132,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
_Sum(a, b, a, beta); _Sum(a, b, a, beta);
} }
/*
tensor summation a = a + b * \beta (do it on site)
keep the result in the tensor a and return nothing
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
*/
void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
{
_Sum(&a, &b, &a, beta);
}
/* /*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h) return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor >> a - a tensor
...@@ -207,9 +220,8 @@ tensor summation c = a + b * \beta ...@@ -207,9 +220,8 @@ tensor summation c = a + b * \beta
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
>> requireLink - if add operation to network
*/ */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink) void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -221,7 +233,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir ...@@ -221,7 +233,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _Sum function */ /* call _Sum function */
_Sum(&a, &b, &c, beta); _Sum(&a, &b, &c, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM); XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta); XLink::AddParamToHead(&c, beta);
...@@ -231,7 +243,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir ...@@ -231,7 +243,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _SumDim function */ /* call _SumDim function */
_SumDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM); XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta ...@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing keep the result in the input tensor a and return nothing
*/ */
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0); void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
/* /*
tensor summation c = a + b * \beta tensor summation c = a + b * \beta
...@@ -42,7 +43,7 @@ make a new tensor c to keep the result and return it ...@@ -42,7 +43,7 @@ make a new tensor c to keep the result and return it
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false); void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -200,9 +200,8 @@ i.e., a is summed with b by broadcasting ...@@ -200,9 +200,8 @@ i.e., a is summed with b by broadcasting
>> c - where we put a+b*\beta. we save it in a if c is NULL >> c - where we put a+b*\beta. we save it in a if c is NULL
>> n - the dimension index >> n - the dimension index
>> beta - the scaling factor >> beta - the scaling factor
>> requireLink - if add operation to network
*/ */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink) void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -211,7 +210,7 @@ void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b ...@@ -211,7 +210,7 @@ void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
/* call _SumDim function */ /* call _SumDim function */
_SumDim(&a, &b, &c, n, beta); _SumDim(&a, &b, &c, n, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM); XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
...@@ -368,9 +367,8 @@ c = a + b * \beta ...@@ -368,9 +367,8 @@ c = a + b * \beta
>> b - another tensor that would be broadcasted >> b - another tensor that would be broadcasted
>> c - the resulting tensor >> c - the resulting tensor
>> beta - the scaling factor >> beta - the scaling factor
>> requireLink - if add operation to network
*/ */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink) void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{ {
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a); InitTensor(&c, &a);
...@@ -379,7 +377,7 @@ void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bo ...@@ -379,7 +377,7 @@ void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bo
/* call _SumBroadcast function */ /* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta); _SumBroadcast(&a, &b, &c, beta);
if (requireLink) { if (c.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST); XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta); XLink::AddParamToHead(&c, beta);
......
...@@ -44,7 +44,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1. ...@@ -44,7 +44,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, /* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */ i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false); void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */ /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
...@@ -54,7 +54,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -54,7 +54,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */ /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false); void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -126,4 +126,18 @@ XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType) ...@@ -126,4 +126,18 @@ XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
return output; return output;
} }
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType)
{
if (!output.isInit || input.dataType != output.dataType) {
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
}
_ConvertDataType(&input, &output);
/* tensor connection */
if (output.enableGrad)
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -38,6 +38,9 @@ void _ConvertDataType(const XTensor * input, XTensor * output); ...@@ -38,6 +38,9 @@ void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */ /* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType); XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
/* convert data type */
void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__ #endif // __CONVERTDATATYPE_H__
...@@ -27,41 +27,41 @@ ...@@ -27,41 +27,41 @@
namespace nts { namespace nts {
template<class T1, class T2> template<class T1, class T2>
T1 descale(T1 x, T2 num) T1 BinaryDescale(T1 x, T2 num)
{ {
return (T1)(x / num); return (T1)(x / num);
} }
template<class T1, class T2> template<class T1, class T2>
T1 power(T1 x, T2 num) T1 BinaryPower(T1 x, T2 num)
{ {
if (num == 0) if (num == 0)
return (T1)1.0; return (T1)1.0;
else if (num == 0.5) else if (num == 0.5)
return (T1)sqrt(num); return (T1)sqrt(x);
else if (num == 2) else if (num == 2)
return x * x; return x * x;
else { else {
if (x == 0 && num < 0) if (x == 0 && num < 0)
return (T1)NAN; return (T1)1e20F;
else else
return (T1)pow(x, num); return (T1)pow(x, num);
} }
} }
template<class T1, class T2> template<class T1, class T2>
T1 scale(T1 x, T2 num) T1 BinaryScale(T1 x, T2 num)
{ {
return (T1)(x * num); return (T1)(x * num);
} }
template<class T1, class T2> template<class T1, class T2>
T1 shift(T1 x, T2 num) T1 BinaryShift(T1 x, T2 num)
{ {
return (T1)(x + num); return (T1)(x + num);
} }
int mod(int x, int num) int BinaryMod(int x, int num)
{ {
return x % num; return x % num;
} }
...@@ -135,6 +135,7 @@ XTensor funcName(const XTensor &a, T num) ...@@ -135,6 +135,7 @@ XTensor funcName(const XTensor &a, T num)
b.SetTMPFlag(); \ b.SetTMPFlag(); \
_funcName(&a, &b, num); \ _funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \ XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
return b; \ return b; \
} \ } \
template XTensor funcName<int>(const XTensor&, int); \ template XTensor funcName<int>(const XTensor&, int); \
...@@ -151,37 +152,38 @@ void funcName(const XTensor &a, XTensor &b, T num) ...@@ -151,37 +152,38 @@ void funcName(const XTensor &a, XTensor &b, T num)
_funcName(&a, &b, num); \ _funcName(&a, &b, num); \
if (b.enableGrad) { \ if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \ XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \ } \
} \ } \
template void funcName<int>(const XTensor&, XTensor&, int); \ template void funcName<int>(const XTensor&, XTensor&, int); \
template void funcName<float>(const XTensor&, XTensor&, float); \ template void funcName<float>(const XTensor&, XTensor&, float); \
template void funcName<double>(const XTensor&, XTensor&, double); template void funcName<double>(const XTensor&, XTensor&, double);
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescale, descale) _SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescale, BinaryDescale)
_SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale) _SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale) SIMPLE_BINARY_FUNCTION_ME(DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE) SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE) SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)
_SIMPLE_BINARY_FUNCTION(_Mod, _CudaMod, mod) _SIMPLE_BINARY_FUNCTION(_Mod, _CudaMod, BinaryMod)
_SIMPLE_BINARY_FUNCTION_ME(_ModMe, _Mod) _SIMPLE_BINARY_FUNCTION_ME(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_ME(ModMe, _Mod) SIMPLE_BINARY_FUNCTION_ME(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION(Mod, _Mod, MATH_MOD) SIMPLE_BINARY_FUNCTION(Mod, _Mod, MATH_MOD)
SIMPLE_BINARY_FUNCTION_VOID(Mod, _Mod, MATH_MOD) SIMPLE_BINARY_FUNCTION_VOID(Mod, _Mod, MATH_MOD)
_SIMPLE_BINARY_FUNCTION(_Power, _CudaPower, power) _SIMPLE_BINARY_FUNCTION(_Power, _CudaPower, BinaryPower)
_SIMPLE_BINARY_FUNCTION_ME(_PowerMe, _Power) _SIMPLE_BINARY_FUNCTION_ME(_PowerMe, _Power)
SIMPLE_BINARY_FUNCTION_ME(PowerMe, _Power) SIMPLE_BINARY_FUNCTION_ME(PowerMe, _Power)
SIMPLE_BINARY_FUNCTION(Power, _Power, MATH_POWER) SIMPLE_BINARY_FUNCTION(Power, _Power, MATH_POWER)
SIMPLE_BINARY_FUNCTION_VOID(Power, _Power, MATH_POWER) SIMPLE_BINARY_FUNCTION_VOID(Power, _Power, MATH_POWER)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScale, scale) _SIMPLE_BINARY_FUNCTION(_Scale, _CudaScale, BinaryScale)
_SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale) _SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale) SIMPLE_BINARY_FUNCTION_ME(ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE) SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE) SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShift, shift) _SIMPLE_BINARY_FUNCTION(_Shift, _CudaShift, BinaryShift)
_SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift) _SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift) SIMPLE_BINARY_FUNCTION_ME(ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT) SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
......
...@@ -31,31 +31,31 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,31 +31,31 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
__device__ __device__
int BaseMod(int x, int base) int BinaryCudaMod(int x, int base)
{ {
return x % base; return x % base;
} }
template<class T1, class T2> template<class T1, class T2>
__device__ __device__
T1 BaseDescale(T1 x, T2 num) T1 BinaryCudaDescale(T1 x, T2 num)
{ {
return x / num; return x / num;
} }
template<class T1, class T2> template<class T1, class T2>
__device__ __device__
T1 BasePower(T1 x, T2 num) T1 BinaryCudaPower(T1 x, T2 num)
{ {
if (num == 0) if (num == 0)
return (T1)1.0; return (T1)1.0;
else if (num == 0.5) else if (num == 0.5)
return (T1)sqrt((float)num); return (T1)sqrt((float)x);
else if (num == 2) else if (num == 2)
return (T1)(x * x); return (T1)(x * x);
else { else {
if (x == 0 && num < 0) if (x == 0 && num < 0)
return 1e20F; return (T1)1e20F;
else else
return (T1)pow((float)x, (float)num); return (T1)pow((float)x, (float)num);
} }
...@@ -63,14 +63,14 @@ T1 BasePower(T1 x, T2 num) ...@@ -63,14 +63,14 @@ T1 BasePower(T1 x, T2 num)
template<class T1, class T2> template<class T1, class T2>
__device__ __device__
T1 BaseScale(T1 x, T2 num) T1 BinaryCudaScale(T1 x, T2 num)
{ {
return x * num; return x * num;
} }
template<class T1, class T2> template<class T1, class T2>
__device__ __device__
T1 BaseShift(T1 x, T2 num) T1 BinaryCudaShift(T1 x, T2 num)
{ {
return x + num; return x + num;
} }
...@@ -126,11 +126,11 @@ template void _Cuda##funcName<int>(const XTensor*, XTensor*, int); ...@@ -126,11 +126,11 @@ template void _Cuda##funcName<int>(const XTensor*, XTensor*, int);
template void _Cuda##funcName<float>(const XTensor*, XTensor*, float); \ template void _Cuda##funcName<float>(const XTensor*, XTensor*, float); \
template void _Cuda##funcName<double>(const XTensor*, XTensor*, double); template void _Cuda##funcName<double>(const XTensor*, XTensor*, double);
SIMPLE_BINARY_FUNCTION_GPU(Descale, BaseDescale) SIMPLE_BINARY_FUNCTION_GPU(Descale, BinaryCudaDescale)
SIMPLE_BINARY_FUNCTION_GPU(Mod, BaseMod) SIMPLE_BINARY_FUNCTION_GPU(Mod, BinaryCudaMod)
SIMPLE_BINARY_FUNCTION_GPU(Power, BasePower) SIMPLE_BINARY_FUNCTION_GPU(Power, BinaryCudaPower)
SIMPLE_BINARY_FUNCTION_GPU(Scale, BaseScale) SIMPLE_BINARY_FUNCTION_GPU(Scale, BinaryCudaScale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, BaseShift) SIMPLE_BINARY_FUNCTION_GPU(Shift, BinaryCudaShift)
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -71,6 +71,18 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper) ...@@ -71,6 +71,18 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
} }
/* /*
set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/
void ClipMe(XTensor& a, DTYPE lower, DTYPE upper)
{
_Clip(&a, &a, lower, upper);
}
/*
set every entry to its clip value (return an XTensor structure) set every entry to its clip value (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> a - input tensor we are processing >> a - input tensor we are processing
...@@ -94,7 +106,7 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper) ...@@ -94,7 +106,7 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
return b; return b;
} }
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink) void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)
{ {
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a); InitTensor(&b, &a);
...@@ -103,7 +115,7 @@ void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool require ...@@ -103,7 +115,7 @@ void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool require
/* call _Clip function */ /* call _Clip function */
_Clip(&a, &b, lower, upper); _Clip(&a, &b, lower, upper);
if (requireLink) { if (b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP); XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower); XLink::AddParamToHead(&b, lower);
......
...@@ -33,11 +33,15 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper); ...@@ -33,11 +33,15 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper); void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
/* set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing */
void ClipMe(XTensor & a, DTYPE lower, DTYPE upper);
/* set every entry to its clip value (return an XTensor structure) /* set every entry to its clip value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper); XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink = false); void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper);
/* /*
backward of Clip function backward of Clip function
......
...@@ -37,37 +37,56 @@ DTYPE myIsNotEqual(DTYPE a, DTYPE b) ...@@ -37,37 +37,56 @@ DTYPE myIsNotEqual(DTYPE a, DTYPE b)
} }
#ifdef USE_CUDA #ifdef USE_CUDA
/* define three marco separately, specify the respective function names (GPU mode) */ /* define three marco separately, specify the respective function names */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \ #define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \ void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \ { \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \ CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \ "Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \ CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \ /* run it on GPUs */ \
if (a->devID >= 0) { \ if (a->devID >= 0) { \
_cudaFuncName(a, b, number); \ if (useCUDA) { \
return; \ _cudaFuncName(a, b, number); \
} \ return; \
DTYPE * d = (DTYPE*)a->data; \ } \
DTYPE * db = (DTYPE*)b->data; \ else \
for (int i = 0; i < a->unitNum; i++) \ ShowNTErrors("No GPU devices support!") \
db[i] = (DTYPE)origFunc(d[i], number); \ } \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION_ME(funcNameMe, _funcName) \
void funcNameMe(XTensor & a, DTYPE number) \
{ \
_funcName(&a, &a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
} }
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \ #define SIMPLE_COMPARE_FUNCTION_VOID(funcName, _funcName, operationId) \
void _funcNameMe(XTensor * a, DTYPE number) \ void funcName(const XTensor &a, XTensor &b, DTYPE number) \
{ \ { \
_funcName(a, a, number); \ if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
} InitTensor(&b, &a); \
} \
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \ _funcName(&a, &b, number); \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
} }
// I think we needn't to make link. // I think we needn't to make link.
...@@ -75,51 +94,15 @@ XTensor funcName(const XTensor &a, DTYPE number) \ ...@@ -75,51 +94,15 @@ XTensor funcName(const XTensor &a, DTYPE number) \
_SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual) _SIMPLE_COMPARE_FUNCTION(_Equal, _CudaEqual, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal) _SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION_ME(EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL) SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual) _SIMPLE_COMPARE_FUNCTION(_NotEqual, _CudaNotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual) _SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION_ME(NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL) SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
#else
/* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
DTYPE * d = (DTYPE*)a->data; \
DTYPE * db = (DTYPE*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (DTYPE)origFunc(d[i], number); \
}
#define _SIMPLE_COMPARE_FUNCTION_ME(_funcNameMe, _funcName) \
void _funcNameMe(XTensor * a, DTYPE number) \
{ \
_funcName(a, a, number); \
}
#define SIMPLE_COMPARE_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, DTYPE number) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, number); \
return b; \
}
// I think we needn't to make link.
// XLink::MakeLink(&a, NULL, &b, operationId);
_SIMPLE_COMPARE_FUNCTION(_Equal, myIsEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_EqualMe, _Equal)
SIMPLE_COMPARE_FUNCTION(Equal, _Equal, MATH_EQUAL)
_SIMPLE_COMPARE_FUNCTION(_NotEqual, myIsNotEqual)
_SIMPLE_COMPARE_FUNCTION_ME(_NotEqualMe, _NotEqual)
SIMPLE_COMPARE_FUNCTION(NotEqual, _NotEqual, MATH_NOTEQUAL)
#endif #endif
......
...@@ -32,18 +32,30 @@ void _Equal(const XTensor * a, XTensor * b, DTYPE value); ...@@ -32,18 +32,30 @@ void _Equal(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is equal to the given value (do it on site) */ /* check whether every entry is equal to the given value (do it on site) */
void _EqualMe(XTensor * a, DTYPE value); void _EqualMe(XTensor * a, DTYPE value);
/* check whether every entry is equal to the given value (do it on site) */
void EqualMe(XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value (return an XTensor structure) */ /* check whether every entry is equal to the given value (return an XTensor structure) */
XTensor Equal(const XTensor & a, DTYPE value); XTensor Equal(const XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value */
void Equal(const XTensor & a, XTensor & b, DTYPE value);
/* check whether every entry is not equal to the given value */ /* check whether every entry is not equal to the given value */
void _NotEqual(const XTensor * a, XTensor * b, DTYPE value); void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (do it on site) */ /* check whether every entry is not equal to the given value (do it on site) */
void _NotEqualMe(XTensor * a, DTYPE value); void _NotEqualMe(XTensor * a, DTYPE value);
/* check whether every entry is not equal to the given value (do it on site) */
void NotEqualMe(XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value (return an XTensor structure) */ /* check whether every entry is not equal to the given value (return an XTensor structure) */
XTensor NotEqual(const XTensor & a, DTYPE value); XTensor NotEqual(const XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value */
void NotEqual(const XTensor & a, XTensor & b, DTYPE value);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // end __COMPARE_H__ #endif // end __COMPARE_H__
\ No newline at end of file
...@@ -42,7 +42,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -42,7 +42,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias >> b - the bias
>> epsilon - a parameter >> epsilon - a parameter
*/ */
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon) void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{ {
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!"); CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
...@@ -109,10 +111,35 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -109,10 +111,35 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias >> b - the bias
>> epsilon - a parameter >> epsilon - a parameter
*/ */
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon) void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{ {
_Normalize(input, input, dim, mean, var, a, b, epsilon); _Normalize(input, input, dim, mean, var, a, b, epsilon);
} }
/*
normalized the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension alone which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void NormalizeMe(XTensor& input, int dim,
const XTensor& mean, const XTensor& var,
const XTensor& a, const XTensor& b, DTYPE epsilon)
{
_Normalize(&input, &input, dim, &mean, &var, &a, &b, epsilon);
}
/* /*
normalized the data with normal distribution (return an XTensor structure) normalized the data with normal distribution (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -129,7 +156,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -129,7 +156,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter >> epsilon - a parameter
<< return - the result of normalized the data with normal distribution << return - the result of normalized the data with normal distribution
*/ */
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon) XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{ {
XTensor output(&input); XTensor output(&input);
output.SetTMPFlag(); output.SetTMPFlag();
...@@ -150,4 +179,48 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen ...@@ -150,4 +179,48 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
return output; return output;
} }
/*
normalized the data with normal distribution (return an XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> output - the output tensor
>> dim - dimension alone which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
<< return - the result of normalized the data with normal distribution
*/
void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
if (!output.isInit || !XTensor::IsSameShaped(&input, &output)) {
InitTensor(&output, &input);
}
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
if (output.enableGrad == true) {
/* tensor connections */
TensorList list(5);
list.Add((XTensor*)&input);
list.Add((XTensor*)&mean);
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -31,7 +31,9 @@ normalized the data with normal distribution. ...@@ -31,7 +31,9 @@ normalized the data with normal distribution.
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon); void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/* /*
normalized the data with normal distribution (do it on site) normalized the data with normal distribution (do it on site)
...@@ -39,7 +41,29 @@ keep the result in the input tenosr and return nothing ...@@ -39,7 +41,29 @@ keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon); void _NormalizeMe(XTensor * input, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalized the data with normal distribution (do it on site)
keep the result in the input tenosr and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void NormalizeMe(XTensor & input, int dim,
const XTensor & mean, const XTensor & var,
const XTensor & a, const XTensor & b, DTYPE epsilon);
/*
normalized the data with normal distribution (return an XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
/* /*
normalized the data with normal distribution (return an XTensor structure) normalized the data with normal distribution (return an XTensor structure)
...@@ -47,7 +71,9 @@ make a new tensor to keep the result and return it ...@@ -47,7 +71,9 @@ make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon); void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -92,6 +92,21 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift) ...@@ -92,6 +92,21 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
} }
/* /*
scale and shift all tensor entires (do it on site)
keep the result in the input tensor a and return nothing
a = a * scale + shift
>> a - the input/output tensor
>> scale - the scaler factor
>> shift - the shift factor
*/
void ScaleAndShiftMe(XTensor& a, DTYPE scale, DTYPE shift)
{
_ScaleAndShift(&a, &a, scale, shift);
}
/*
scale and shift all tensor entires (return an XTensor structure) scale and shift all tensor entires (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -127,9 +142,8 @@ b = a * scale + shift ...@@ -127,9 +142,8 @@ b = a * scale + shift
>> b - the output tensor >> b - the output tensor
>> scale - the scaler factor >> scale - the scaler factor
>> shift - the shift factor >> shift - the shift factor
>> requireLink - if add operation to network
*/ */
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, bool requireLink) void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift)
{ {
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a); InitTensor(&b, &a);
...@@ -138,7 +152,7 @@ void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, boo ...@@ -138,7 +152,7 @@ void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, boo
/* call _ScaleAndShift function */ /* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift); _ScaleAndShift(&a, &b, scale, shift);
if (requireLink) { if (b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT); XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale); XLink::AddParamToHead(&b, scale);
......
...@@ -45,6 +45,13 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0); ...@@ -45,6 +45,13 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/* /*
scale and shift all tensor entires scale and shift all tensor entires
keep the result in the input tensor a and return nothing
a = a * scale + shift
*/
void ScaleAndShiftMe(XTensor & a, DTYPE scale, DTYPE shift = 0);
/*
scale and shift all tensor entires
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
b = a * scale + shift b = a * scale + shift
*/ */
...@@ -54,7 +61,7 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0); ...@@ -54,7 +61,7 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
scale and shift all tensor entires scale and shift all tensor entires
b = a * scale + shift b = a * scale + shift
*/ */
void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0, bool requireLink = false); void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -28,24 +28,24 @@ ...@@ -28,24 +28,24 @@
namespace nts{ namespace nts{
template<class T> template<class T>
T negate(T x) { T UnaryNegate(T x) {
return (T)-x; return (T)-x;
} }
template<class T> template<class T>
T square(T x) T UnarySquare(T x)
{ {
return (T)(x * x); return (T)(x * x);
} }
template<class T> template<class T>
T round(T r) T UnaryRound(T r)
{ {
return (r > 0.0) ? (T)floor(r + 0.5) : (T)ceil(r - 0.5); return (r > 0.0) ? (T)floor(r + 0.5) : (T)ceil(r - 0.5);
} }
template<class T> template<class T>
T sign(T r) T UnarySign(T r)
{ {
if (r > 0.0) if (r > 0.0)
return (T)1.0; return (T)1.0;
...@@ -56,13 +56,13 @@ T sign(T r) ...@@ -56,13 +56,13 @@ T sign(T r)
} }
template<class T> template<class T>
T isnonzero(T r) T UnaryIsNonZero(T r)
{ {
return (r != 0.0) ? (T)1.0 : (T)0.0; return (r != 0.0) ? (T)1.0 : (T)0.0;
} }
template<class T> template<class T>
T iszero(T r) T UnaryIsZero(T r)
{ {
return (r == 0.0) ? (T)1.0 : (T)0.0; return (r == 0.0) ? (T)1.0 : (T)0.0;
} }
...@@ -142,14 +142,14 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs) ...@@ -142,14 +142,14 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil) _SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp) _SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor) _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero) _SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, UnaryIsNonZero)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero) _SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, UnaryIsZero)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log) _SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION(_Negate, _CudaNegate, negate) _SIMPLE_UNARY_FUNCTION(_Negate, _CudaNegate, UnaryNegate)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round) _SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION(_Sign, _CudaSign, sign) _SIMPLE_UNARY_FUNCTION(_Sign, _CudaSign, UnarySign)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt) _SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square) _SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, UnarySquare)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin) _SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos) _SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan) _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
......
...@@ -32,63 +32,63 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,63 +32,63 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
template<class T> template<class T>
__device__ __device__
T BaseCeil(T x) T UnaryCudaCeil(T x)
{ {
return (T)ceil((float)x); return (T)ceil((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseExp(T x) T UnaryCudaExp(T x)
{ {
return (T)exp((float)x); return (T)exp((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseFabs(T x) T UnaryCudaFabs(T x)
{ {
return (T)fabs((float)x); return (T)fabs((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseFloor(T x) T UnaryCudaFloor(T x)
{ {
return (T)floor((float)x); return (T)floor((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseIsNonZero(T r) T UnaryCudaIsNonZero(T r)
{ {
return (r != (T)0.0) ? (T)1.0 : (T)0.0; return (r != (T)0.0) ? (T)1.0 : (T)0.0;
} }
template<class T> template<class T>
__device__ __device__
T BaseIsZero(T r) T UnaryCudaIsZero(T r)
{ {
return (r == (T)0.0) ? (T)1.0 : (T)0.0; return (r == (T)0.0) ? (T)1.0 : (T)0.0;
} }
template<class T> template<class T>
__device__ __device__
T BaseLog(T x) T UnaryCudaLog(T x)
{ {
return (T)log((float)x); return (T)log((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseNegate(T x) T UnaryCudaNegate(T x)
{ {
return -x; return -x;
} }
template<class T> template<class T>
__device__ __device__
T BaseSign(T r) T UnaryCudaSign(T r)
{ {
if (r > (T)0) if (r > (T)0)
return 1.0; return 1.0;
...@@ -100,43 +100,43 @@ T BaseSign(T r) ...@@ -100,43 +100,43 @@ T BaseSign(T r)
template<class T> template<class T>
__device__ __device__
T BaseSqrt(T x) T UnaryCudaSqrt(T x)
{ {
return (T)sqrt((float)x); return (T)sqrt((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseSquare(T x) T UnaryCudaSquare(T x)
{ {
return x * x; return x * x;
} }
template<class T> template<class T>
__device__ __device__
T BaseRound(T r) T UnaryCudaRound(T r)
{ {
return (r > (T)0.0) ? (T)BaseFloor(r + (T)0.5) : (T)BaseCeil(r - (T)0.5); return (r > (T)0.0) ? (T)UnaryCudaFloor(r + (T)0.5) : (T)UnaryCudaCeil(r - (T)0.5);
} }
template<class T> template<class T>
__device__ __device__
T BaseSin(T x) T UnaryCudaSin(T x)
{ {
return (T)sin((float)x); return (T)sin((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseCos(T x) T UnaryCudaCos(T x)
{ {
return (T)cos((float)x); return (T)cos((float)x);
} }
template<class T> template<class T>
__device__ __device__
T BaseTan(T x) T UnaryCudaTan(T x)
{ {
return (T)tan((float)x); return (T)tan((float)x);
} }
...@@ -181,10 +181,6 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \ ...@@ -181,10 +181,6 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
Kernel##funcName<<<blocks, threads>>> \ Kernel##funcName<<<blocks, threads>>> \
((int*)a->data, (int*)b->data, a->unitNum); \ ((int*)a->data, (int*)b->data, a->unitNum); \
} \ } \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
} \
else { \ else { \
ShowNTErrors("TODO!"); \ ShowNTErrors("TODO!"); \
} \ } \
...@@ -194,22 +190,22 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \ ...@@ -194,22 +190,22 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
SIMPLE_UNARY_FUNCTION_GPU(Absolute, BaseFabs) SIMPLE_UNARY_FUNCTION_GPU(Absolute, UnaryCudaFabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, BaseCeil) SIMPLE_UNARY_FUNCTION_GPU(Ceil, UnaryCudaCeil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, BaseExp) SIMPLE_UNARY_FUNCTION_GPU(Exp, UnaryCudaExp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, BaseFloor) SIMPLE_UNARY_FUNCTION_GPU(Floor, UnaryCudaFloor)
SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, BaseIsNonZero) SIMPLE_UNARY_FUNCTION_GPU(IsNonZero, UnaryCudaIsNonZero)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, BaseIsZero) SIMPLE_UNARY_FUNCTION_GPU(IsZero, UnaryCudaIsZero)
SIMPLE_UNARY_FUNCTION_GPU(Log, BaseLog) SIMPLE_UNARY_FUNCTION_GPU(Log, UnaryCudaLog)
SIMPLE_UNARY_FUNCTION_GPU(Negate, BaseNegate) SIMPLE_UNARY_FUNCTION_GPU(Negate, UnaryCudaNegate)
SIMPLE_UNARY_FUNCTION_GPU(Round, BaseRound) SIMPLE_UNARY_FUNCTION_GPU(Round, UnaryCudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sign, BaseSign) SIMPLE_UNARY_FUNCTION_GPU(Sign, UnaryCudaSign)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, BaseSqrt) SIMPLE_UNARY_FUNCTION_GPU(Sqrt, UnaryCudaSqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, BaseSquare) SIMPLE_UNARY_FUNCTION_GPU(Square, UnaryCudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, BaseSin) SIMPLE_UNARY_FUNCTION_GPU(Sin, UnaryCudaSin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, BaseCos) SIMPLE_UNARY_FUNCTION_GPU(Cos, UnaryCudaCos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, BaseTan) SIMPLE_UNARY_FUNCTION_GPU(Tan, UnaryCudaTan)
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -52,15 +52,15 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -52,15 +52,15 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
} }
/* dense -> sparse */ /* dense -> sparse */
else if (!s->isSparse && t->isSparse && else if (!s->isSparse && t->isSparse &&
s->dataType == DEFAULT_DTYPE && s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE) t->dataType == DEFAULT_DTYPE)
{ {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
/* sparse -> dense */ /* sparse -> dense */
else if (s->isSparse && !t->isSparse && else if (s->isSparse && !t->isSparse &&
s->dataType == DEFAULT_DTYPE && s->dataType == DEFAULT_DTYPE &&
t->dataType == DEFAULT_DTYPE) t->dataType == DEFAULT_DTYPE)
{ {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
...@@ -219,7 +219,6 @@ void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim, ...@@ -219,7 +219,6 @@ void _SpreadForCopyIndexed(XTensor * s, XTensor * c, int dim,
} }
} }
} }
/* /*
...@@ -236,15 +235,18 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index) ...@@ -236,15 +235,18 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
int order = source->order; int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
CheckNTErrors(collection->unitNum/collection->GetDim(-1) == index->unitNum,
"Illegal dimension!");
for(int i = 0; i < order; i++){ //for(int i = 0; i < order; i++){
if(i == dim){ // if(i == dim){
CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!"); // CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
} // }
else { // else {
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!"); // CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
} // }
} //}
#ifdef USE_CUDA #ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) { if(source->devID >= 0 && collection->devID >= 0) {
......
...@@ -137,9 +137,8 @@ get the max value of the items along a dimension of the tensor ...@@ -137,9 +137,8 @@ get the max value of the items along a dimension of the tensor
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
>> requireLink - if add operation to network
*/ */
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink) void ReduceMax(const XTensor &input, XTensor &output, int dim)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -163,7 +162,7 @@ void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink) ...@@ -163,7 +162,7 @@ void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink)
/* call _ReduceMax function */ /* call _ReduceMax function */
_ReduceMax(&input, &output, dim); _ReduceMax(&input, &output, dim);
if (requireLink) { if (output.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
......
...@@ -504,7 +504,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim) ...@@ -504,7 +504,7 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!"); CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!"); CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
......
...@@ -36,7 +36,7 @@ make a new tensor to keep the result and return it ...@@ -36,7 +36,7 @@ make a new tensor to keep the result and return it
XTensor ReduceMax(const XTensor &input, int dim); XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink = false); void ReduceMax(const XTensor &input, XTensor &output, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -94,9 +94,8 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i ...@@ -94,9 +94,8 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
>> requireLink - if add operation to network
*/ */
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink) void ReduceMean(const XTensor &input, XTensor &output, int dim)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -120,7 +119,7 @@ void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink ...@@ -120,7 +119,7 @@ void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink
/* call _ReduceMean function */ /* call _ReduceMean function */
_ReduceMean(&input, &output, dim); _ReduceMean(&input, &output, dim);
if (requireLink) { if (output.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN); XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
......
...@@ -43,7 +43,7 @@ XTensor ReduceMean(const XTensor &input, int dim); ...@@ -43,7 +43,7 @@ XTensor ReduceMean(const XTensor &input, int dim);
get the mean value along a dimension of the tensor get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/ */
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink = false); void ReduceMean(const XTensor &input, XTensor &output, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -244,7 +244,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow ...@@ -244,7 +244,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
return output; return output;
} }
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp, bool requireLink) void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -268,7 +268,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &sh ...@@ -268,7 +268,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &sh
/* call _ReduceSum function */ /* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp); _ReduceSum(&input, &output, dim, &shift, power, isExp);
if (requireLink) { if (output.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM); XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
...@@ -336,9 +336,8 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true ...@@ -336,9 +336,8 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> shift - shift the input >> shift - shift the input
>> ieExp - specify if the exp() is performed >> ieExp - specify if the exp() is performed
>> power - we perform pow(item_i, power) on each item in the array >> power - we perform pow(item_i, power) on each item in the array
>> requireLink - if add operation to network
*/ */
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp, bool requireLink) void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -362,7 +361,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool ...@@ -362,7 +361,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool
/* call _ReduceSum function */ /* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp); _ReduceSum(&input, &output, dim, NULL, power, isExp);
if (requireLink) { if (output.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM); XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
......
...@@ -341,7 +341,7 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output, ...@@ -341,7 +341,7 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
if (tid < blockDim.x / 32) if (tid < blockDim.x / 32)
value = data[tid]; value = data[tid];
else else
value = 0; value = 0;
value = shflDownReduceSum(value); value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.x < reducedStrideNum) { if (tid == 0 && blockIdx.x < reducedStrideNum) {
...@@ -692,7 +692,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -692,7 +692,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!"); CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!"); CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
......
...@@ -44,7 +44,7 @@ sum = \sum_i exp(a_i - shift) if isExp == true ...@@ -44,7 +44,7 @@ sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false); XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false); void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/* /*
sum the items along a dimension of the tensor (return an XTensor structure) sum the items along a dimension of the tensor (return an XTensor structure)
...@@ -61,7 +61,7 @@ For a 1-dimensional data array a, ...@@ -61,7 +61,7 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false); void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -91,9 +91,8 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2 ...@@ -91,9 +91,8 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
>> shift - bias on the input >> shift - bias on the input
>> requireLink - if add operation to network
*/ */
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink) void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -117,7 +116,7 @@ void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTen ...@@ -117,7 +116,7 @@ void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTen
/* call _ReduceSumSquared function */ /* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift); _ReduceSumSquared(&input, &output, dim, &shift);
if (requireLink) { if (output.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED); XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
......
...@@ -45,7 +45,7 @@ squared sum of the items along a dimension of the tensor ...@@ -45,7 +45,7 @@ squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a, For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
*/ */
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink = false); void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -94,9 +94,8 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 ...@@ -94,9 +94,8 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
>> mean - the mean value >> mean - the mean value
>> requireLink - if add operation to network
*/ */
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink) void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean)
{ {
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
...@@ -120,7 +119,7 @@ void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTenso ...@@ -120,7 +119,7 @@ void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTenso
/* call _ReduceVariance function */ /* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean); _ReduceVariance(&input, &output, dim, &mean);
if (requireLink) { if (output.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE); XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
......
...@@ -43,7 +43,7 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean); ...@@ -43,7 +43,7 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean);
variance of the items along a dimension of the tensor variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink = false); void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -137,6 +137,115 @@ XTensor Concatenate(const TensorList &smalls, int dim) ...@@ -137,6 +137,115 @@ XTensor Concatenate(const TensorList &smalls, int dim)
} }
} }
bool CheckConcatenateShape(const TensorList &smalls, int dim, XTensor &big, bool uniform)
{
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != big.dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return false;
}
void Concatenate(const TensorList & smalls, XTensor & big, int dim)
{
CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
uniform = false;
}
if (!big.isInit || !CheckConcatenateShape(smalls, dim, big, uniform)) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
if (uniform) {
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
else {
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
/* destroy variables */
delete[] dimSize;
}
if (uniform) {
/* call _Merge function */
_Merge(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
XLink::AddParamToHeadInt(&big, dim);
}
}
else {
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
if (big.enableGrad) {
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
}
}
}
/* /*
concatenate two tensors along a given dimension concatenate two tensors along a given dimension
......
...@@ -41,6 +41,8 @@ Note that this is actually a wrapper that selects ...@@ -41,6 +41,8 @@ Note that this is actually a wrapper that selects
*/ */
XTensor Concatenate(const TensorList &smalls, int dim); XTensor Concatenate(const TensorList &smalls, int dim);
void Concatenate(const TensorList & smalls, XTensor & big, int dim);
/* concatenate two tensors along a given dimension */ /* concatenate two tensors along a given dimension */
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim); void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim);
......
...@@ -232,7 +232,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim) ...@@ -232,7 +232,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
return t; return t;
} }
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool requireLink) void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
{ {
if (!t.isInit || !CheckMergeSize(&s, &t, whereToMerge, leadingDim)) { if (!t.isInit || !CheckMergeSize(&s, &t, whereToMerge, leadingDim)) {
if (leadingDim < 0) if (leadingDim < 0)
...@@ -261,7 +261,7 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool ...@@ -261,7 +261,7 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool
/* call _Merge function */ /* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim); _Merge(&s, &t, whereToMerge, leadingDim);
if (requireLink) { if (t.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE); XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge); XLink::AddParamToHeadInt(&t, whereToMerge);
......
...@@ -33,7 +33,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = - ...@@ -33,7 +33,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -
e.g., (M, N/3, 3) -> (M, N) */ e.g., (M, N/3, 3) -> (M, N) */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1); XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1, bool requireLink = false); void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */ /* merge small tensors into a big tensor */
void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge); void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge);
......
...@@ -41,6 +41,13 @@ a = permuted(a) ...@@ -41,6 +41,13 @@ a = permuted(a)
*/ */
void _PermuteMe(XTensor * a, int * dimPermute); void _PermuteMe(XTensor * a, int * dimPermute);
/*
permute the tensor dimensions (do it on site).
keep the result in the input tensor and return nothing.
a = permuted(a)
*/
void PermuteMe(XTensor &a, int * dimPermute);
/* /*
make a tensor with permuted dimensions (return an XTensor structure). make a tensor with permuted dimensions (return an XTensor structure).
make a new tensor to keep the result and return it. make a new tensor to keep the result and return it.
......
...@@ -48,7 +48,7 @@ XTensor Reshape(XTensor &s, int order, int * dimSize) ...@@ -48,7 +48,7 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
return t; return t;
} }
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink) void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)
{ {
if (!t.isInit || !XTensor::IsSameShaped(&t, &s)) { if (!t.isInit || !XTensor::IsSameShaped(&t, &s)) {
InitTensor(&t, &s); InitTensor(&t, &s);
...@@ -57,7 +57,7 @@ void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink) ...@@ -57,7 +57,7 @@ void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink)
/* call Reshape function */ /* call Reshape function */
t.Reshape(order, dimSize); t.Reshape(order, dimSize);
if (requireLink) { if (t.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE); XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
} }
......
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* reshape the tensor */ /* reshape the tensor */
XTensor Reshape(XTensor &s, int order, int * dimSize); XTensor Reshape(XTensor &s, int order, int * dimSize);
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink = false); void Reshape(XTensor &s, XTensor &t, int order, int * dimSize);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __RESHAPE_H__ #endif // __RESHAPE_H__
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor (for return) >> t - the target tensor (for return)
...@@ -61,7 +61,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -61,7 +61,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
} }
/* for the case that we split the last dimension. Actually /* for the case that we split the last dimension. Actually
(N, M) and (N, M/3, 3) have the same memory layout */ (N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) { if (s->order - 1 == whereToSplitRDI) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize); XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return; return;
...@@ -184,7 +184,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int ...@@ -184,7 +184,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
} }
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
...@@ -227,7 +227,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum) ...@@ -227,7 +227,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
return t; return t;
} }
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink) void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum)
{ {
if (!t.isInit || !CheckSplitSize(&s, &t, whereToSplit, splitNum)) { if (!t.isInit || !CheckSplitSize(&s, &t, whereToSplit, splitNum)) {
int order = s.order + 1; int order = s.order + 1;
...@@ -251,7 +251,7 @@ void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool re ...@@ -251,7 +251,7 @@ void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool re
/* call _Split function */ /* call _Split function */
_Split(&s, &t, whereToSplit, splitNum); _Split(&s, &t, whereToSplit, splitNum);
if (requireLink) { if (t.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT); XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit); XLink::AddParamToHeadInt(&t, whereToSplit);
......
...@@ -41,7 +41,7 @@ e.g., (M, N) -> (M, N/3, 3) ...@@ -41,7 +41,7 @@ e.g., (M, N) -> (M, N/3, 3)
*/ */
XTensor Split(const XTensor &s, int whereToSplit, int splitNum); XTensor Split(const XTensor &s, int whereToSplit, int splitNum);
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink = false); void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */ /* split a big tensor into small tensors */
void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum); void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum);
......
...@@ -89,6 +89,20 @@ void _SqueezeMe(XTensor * source, int leadingDim) ...@@ -89,6 +89,20 @@ void _SqueezeMe(XTensor * source, int leadingDim)
} }
/*
squeeze the tensor along the specified dimension (do it on site).
keep the result in the input tensor and return nothing.

>> source - the input tensor; overwritten with the squeezed result
>> leadingDim - the dimension that we would squeeze
                if leadingDim = -1, squeeze all dimensions that are 1
                else, squeeze the specified dimension
*/
void SqueezeMe(XTensor& source, int leadingDim)
{
    /* in-place: source serves as both the input and the output of _Squeeze */
    _Squeeze(&source, &source, leadingDim);
}
/*
squeeze the tensor along the specified dimension (return an XTensor structure) squeeze the tensor along the specified dimension (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -112,7 +126,7 @@ XTensor Squeeze(XTensor & source, int leadingDim) ...@@ -112,7 +126,7 @@ XTensor Squeeze(XTensor & source, int leadingDim)
return target; return target;
} }
void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLink) void Squeeze(XTensor & source, XTensor & target, int leadingDim)
{ {
if (!target.isInit || !XTensor::IsSameShaped(&source, &target)) { if (!target.isInit || !XTensor::IsSameShaped(&source, &target)) {
InitTensor(&target, &source); InitTensor(&target, &source);
...@@ -121,7 +135,7 @@ void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLin ...@@ -121,7 +135,7 @@ void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLin
/* call _Squeeze function */ /* call _Squeeze function */
_Squeeze(&source, &target, leadingDim); _Squeeze(&source, &target, leadingDim);
if (requireLink) { if (target.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE); XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
} }
......
...@@ -33,11 +33,15 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1); ...@@ -33,11 +33,15 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1);
keep the result in the input tensor a and return nothing */ keep the result in the input tensor a and return nothing */
void _SqueezeMe(XTensor * source, int leadingDim = -1); void _SqueezeMe(XTensor * source, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (do it on site)
keep the result in the input tensor a and return nothing */
void SqueezeMe(XTensor & source, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (return an XTensor structure) /* squeeze the tensor along the specified dimension (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Squeeze(XTensor & source, int leadingDim = -1); XTensor Squeeze(XTensor & source, int leadingDim = -1);
void Squeeze(XTensor & source, XTensor & target, int leadingDim = -1, bool requireLink = false); void Squeeze(XTensor & source, XTensor & target, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -166,7 +166,7 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize) ...@@ -166,7 +166,7 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
return b; return b;
} }
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink) void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize)
{ {
if (!b.isInit || !CheckUnsqueezeSize(&a, &b, dim, dSize)) { if (!b.isInit || !CheckUnsqueezeSize(&a, &b, dim, dSize)) {
int order = a.order + 1; int order = a.order + 1;
...@@ -191,7 +191,7 @@ void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLin ...@@ -191,7 +191,7 @@ void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLin
/* call _Unsqueeze function */ /* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize); _Unsqueeze(&a, &b, dim, dSize);
if (requireLink) { if (b.enableGrad) {
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE); XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim); XLink::AddParamToHeadInt(&b, dim);
......
...@@ -35,7 +35,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize); ...@@ -35,7 +35,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Unsqueeze(const XTensor &a, int dim, int dSize); XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink = false); void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -98,6 +98,21 @@ void _SortMe(XTensor * a, XTensor * index, int dim) ...@@ -98,6 +98,21 @@ void _SortMe(XTensor * a, XTensor * index, int dim)
} }
/* /*
sort the tensor along a given dimension (do it on site).
keep the result in the input tensor a and return nothing.

>> a - input tensor; overwritten with the sorted result
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void SortMe(XTensor& a, XTensor& index, int dim)
{
    /* in-place: a is both the input and the output of _Sort */
    _Sort(&a, &a, &index, dim);
}
/*
sort the tensor along a given dimension (return an XTensor structure) sort the tensor along a given dimension (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
......
...@@ -217,7 +217,7 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in ...@@ -217,7 +217,7 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!"); CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1; int dimRDI = a->order - dim - 1;
if (k < 0 || k > b->dimSizeRDI[dimRDI]) if (k < 0 || k > b->dimSizeRDI[dimRDI])
k = b->dimSizeRDI[dimRDI]; k = b->dimSizeRDI[dimRDI];
......
...@@ -67,8 +67,8 @@ void CudaSetAscendingOrder(XTensor * a, int dim) ...@@ -67,8 +67,8 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{ {
CheckNTErrors((a->dataType == X_INT), "TODO!"); CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1; int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI]; int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++) for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i]; stride *= a->dimSizeRDI[i];
......
...@@ -149,6 +149,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim ...@@ -149,6 +149,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!"); CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
XTensor mask; XTensor mask;
// int * maskArrayInt = NULL;
DTYPE * maskArray = NULL; DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb); DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
......
...@@ -83,7 +83,7 @@ XTensor HardTanH(const XTensor &x) ...@@ -83,7 +83,7 @@ XTensor HardTanH(const XTensor &x)
return y; return y;
} }
void HardTanH(const XTensor &x, XTensor &y, bool requireLink) void HardTanH(const XTensor &x, XTensor &y)
{ {
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) { if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x); InitTensor(&y, &x);
...@@ -92,7 +92,7 @@ void HardTanH(const XTensor &x, XTensor &y, bool requireLink) ...@@ -92,7 +92,7 @@ void HardTanH(const XTensor &x, XTensor &y, bool requireLink)
/* call _HardTanH function */ /* call _HardTanH function */
_HardTanH(&x, &y); _HardTanH(&x, &y);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH); XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
} }
......
...@@ -39,7 +39,7 @@ void _HardTanH(const XTensor * x, XTensor * y); ...@@ -39,7 +39,7 @@ void _HardTanH(const XTensor * x, XTensor * y);
/* hard tanh function (return an XTensor structure) */ /* hard tanh function (return an XTensor structure) */
XTensor HardTanH(const XTensor &x); XTensor HardTanH(const XTensor &x);
void HardTanH(const XTensor &x, XTensor &y, bool requireLink = false); void HardTanH(const XTensor &x, XTensor &y);
/* de/dx */ /* de/dx */
void _HardTanHBackward(XTensor * y, XTensor * x, void _HardTanHBackward(XTensor * y, XTensor * x,
......
...@@ -59,7 +59,7 @@ XTensor Identity(const XTensor &x) ...@@ -59,7 +59,7 @@ XTensor Identity(const XTensor &x)
return y; return y;
} }
void Identity(const XTensor &x, XTensor &y, bool requireLink) void Identity(const XTensor &x, XTensor &y)
{ {
if (!y.isInit || !y.IsSameShaped(&y, &x)) { if (!y.isInit || !y.IsSameShaped(&y, &x)) {
InitTensor(&y, &x); InitTensor(&y, &x);
...@@ -68,7 +68,7 @@ void Identity(const XTensor &x, XTensor &y, bool requireLink) ...@@ -68,7 +68,7 @@ void Identity(const XTensor &x, XTensor &y, bool requireLink)
/* call _Identity function */ /* call _Identity function */
_Identity(&x, &y); _Identity(&x, &y);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY); XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
} }
......
...@@ -32,7 +32,7 @@ void _Identity(const XTensor * x, XTensor * y); ...@@ -32,7 +32,7 @@ void _Identity(const XTensor * x, XTensor * y);
/* identity function y = x (return an XTensor structure) */ /* identity function y = x (return an XTensor structure) */
XTensor Identity(const XTensor &x); XTensor Identity(const XTensor &x);
void Identity(const XTensor &x, XTensor &y, bool requireLink = false); void Identity(const XTensor &x, XTensor &y);
/* de/dx */ /* de/dx */
void _IdentityBackward(const XTensor * y, const XTensor * x, void _IdentityBackward(const XTensor * y, const XTensor * x,
......
...@@ -194,7 +194,15 @@ XTensor LogSoftmax(const XTensor &x, int leadDim) ...@@ -194,7 +194,15 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
return y; return y;
} }
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink) /*
log scale softmax y = log(e^x / \sum_{i} e^{x_i})
make a new tensor to keep the result and return it
>> x - input vector
>> y - output vector
>> leadDim - leading dimension (along which we perform reduction)
*/
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
{ {
int ld = leadDim; int ld = leadDim;
if (ld < 0) if (ld < 0)
...@@ -207,32 +215,12 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink) ...@@ -207,32 +215,12 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
/* call _LogSoftmax function */ /* call _LogSoftmax function */
_LogSoftmax(&x, &y, ld); _LogSoftmax(&x, &y, ld);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX); XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld); XLink::AddParamToHeadInt(&y, ld);
} }
} }
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i})
make a new tensor to keep the result and return it
>> x - input vector
>> y - output vector
>> leadDim - leading dimension (along which we perform reduction)
*/
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
{
if(!XTensor::IsSameShaped(&x, &y))
InitTensor(&y, &x);
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
}
/* /*
backward computation for dense matrices with default data type backward computation for dense matrices with default data type
......
...@@ -33,7 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim); ...@@ -33,7 +33,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim); XTensor LogSoftmax(const XTensor &x, int leadDim);
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false); void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */ /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim); void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
......
...@@ -75,7 +75,7 @@ XTensor Rectify(const XTensor &x) ...@@ -75,7 +75,7 @@ XTensor Rectify(const XTensor &x)
return y; return y;
} }
void Rectify(const XTensor &x, XTensor &y, bool requireLink) void Rectify(const XTensor &x, XTensor &y)
{ {
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) { if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x); InitTensor(&y, &x);
...@@ -84,7 +84,7 @@ void Rectify(const XTensor &x, XTensor &y, bool requireLink) ...@@ -84,7 +84,7 @@ void Rectify(const XTensor &x, XTensor &y, bool requireLink)
/* call _Rectify function */ /* call _Rectify function */
_Rectify(&x, &y); _Rectify(&x, &y);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY); XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
} }
......
...@@ -32,7 +32,7 @@ void _Rectify(const XTensor * x, XTensor * y); ...@@ -32,7 +32,7 @@ void _Rectify(const XTensor * x, XTensor * y);
/* rectify function y = max(0, x) (return an XTensor structure) */ /* rectify function y = max(0, x) (return an XTensor structure) */
XTensor Rectify(const XTensor &x); XTensor Rectify(const XTensor &x);
void Rectify(const XTensor &x, XTensor &y, bool requireLink = false); void Rectify(const XTensor &x, XTensor &y);
/* de/dx */ /* de/dx */
void _RectifyBackward(XTensor * y, XTensor * x, void _RectifyBackward(XTensor * y, XTensor * x,
......
...@@ -78,7 +78,7 @@ XTensor Sigmoid(const XTensor &x) ...@@ -78,7 +78,7 @@ XTensor Sigmoid(const XTensor &x)
return y; return y;
} }
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink) void Sigmoid(const XTensor &x, XTensor &y)
{ {
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) { if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x); InitTensor(&y, &x);
...@@ -87,7 +87,7 @@ void Sigmoid(const XTensor &x, XTensor &y, bool requireLink) ...@@ -87,7 +87,7 @@ void Sigmoid(const XTensor &x, XTensor &y, bool requireLink)
/* call _Sigmoid function */ /* call _Sigmoid function */
_Sigmoid(&x, &y); _Sigmoid(&x, &y);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID); XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
} }
......
...@@ -32,7 +32,7 @@ void _Sigmoid(const XTensor * x, XTensor * y); ...@@ -32,7 +32,7 @@ void _Sigmoid(const XTensor * x, XTensor * y);
/* sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure) */ /* sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure) */
XTensor Sigmoid(const XTensor &x); XTensor Sigmoid(const XTensor &x);
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink = false); void Sigmoid(const XTensor &x, XTensor &y);
/* de/dx */ /* de/dx */
void _SigmoidBackward(XTensor * y, XTensor * x, void _SigmoidBackward(XTensor * y, XTensor * x,
......
...@@ -148,7 +148,7 @@ XTensor Softmax(const XTensor &x, int leadDim) ...@@ -148,7 +148,7 @@ XTensor Softmax(const XTensor &x, int leadDim)
return y; return y;
} }
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink) void Softmax(const XTensor &x, XTensor &y, int leadDim)
{ {
int ld = leadDim; int ld = leadDim;
if (ld < 0) if (ld < 0)
...@@ -161,7 +161,7 @@ void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink) ...@@ -161,7 +161,7 @@ void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
/* call _Softmax function */ /* call _Softmax function */
_Softmax(&x, &y, ld); _Softmax(&x, &y, ld);
if (requireLink) { if (y.enableGrad) {
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX); XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld); XLink::AddParamToHeadInt(&y, ld);
......
...@@ -33,7 +33,7 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim); ...@@ -33,7 +33,7 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (return an XTensor structure) */ /* softmax y = e^x / \sum_{i} e^{x_i} (return an XTensor structure) */
XTensor Softmax(const XTensor &x, int leadDim); XTensor Softmax(const XTensor &x, int leadDim);
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false); void Softmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */ /* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
......
...@@ -195,17 +195,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, ...@@ -195,17 +195,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims; delete[] dims;
} }
//if(padding != NULL) { if(padding != NULL) {
// XTensor * tmp = NewTensor(padding); XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp); _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp); int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum); _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp; delete tmp;
//} }
//else { else {
// int num = dedy->unitNum / dedy->GetDim(n); int num = dedy->unitNum / dedy->GetDim(n);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num); _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
//} }
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论