Commit 93f51095 by liyinqiao

Bug fixed

1. Merge with the Huchi branch (replace all uses of requireLink with enableGrad, which enables gradient computation for a tensor);
2. Update the global memory size (this may make the memory size slightly larger than in the old version).
parent 4040dde0
......@@ -68,6 +68,9 @@ void BackwardTest()
XTensor a;
XTensor b;
XTensor c;
a.enableGrad = true;
b.enableGrad = false;
c.enableGrad = false;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
......@@ -85,14 +88,15 @@ void BackwardTest()
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
c = DivDim(a, b, 0);
DivDim(a, b, c, 0);
c.Dump(stderr, "c:");
auto loss = CrossEntropy(c, a);
//XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Backward(loss);
net.Dump(stderr);
a.grad->Dump(stderr);
}
......
......@@ -297,12 +297,12 @@ void T2TSearch::Generate(T2TStateBundle * beam)
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
_DescaleMe(preID, sizeVocab);
Descale(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it with vocab-size and computing the remainder. */
_ModMe(index, sizeVocab);
ModMe(index, sizeVocab);
score.Reshape(order, dims);
......
......@@ -101,7 +101,6 @@ void TensorListBase<T>::Add(T&& item)
maxNum = maxNum * 2 + 1;
}
items[count++] = item;
}
/*
......
......@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 128;
*myBufSize = MILLION * 256;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 128;
*myBufSize = MILLION * 512;
}
}
}
......
......@@ -101,7 +101,7 @@ XTensor::XTensor(const XTensor * reference)
SetDataPointer();
id = MakeTensorID();
InitTensorV2(this, reference);
InitTensor(this, reference);
}
/*
......@@ -173,7 +173,7 @@ XTensor::XTensor(const XTensor &reference)
else{
devID = reference.devID;
mem = reference.mem;
InitTensorV2(this, &reference);
InitTensor(this, &reference);
_CopyValues(&reference, this);
}
......@@ -279,6 +279,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
enableGrad = false;
visitMark = 0;
grad = NULL;
}
......@@ -309,6 +310,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
{
strcpy(name, tensor.name);
order = tensor.order;
enableGrad = tensor.enableGrad;
memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = tensor.dataType;
......@@ -2445,6 +2447,7 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
if(reference->order < 0)
return;
tensor->enableGrad = reference->enableGrad;
InitTensor(tensor, reference->order, reference->dimSize,
reference->dataType, reference->denseRatio,
reference->devID, reference->mem);
......@@ -2460,6 +2463,7 @@ void InitTensorV2(XTensor * tensor, const XTensor * reference)
if(reference->order < 0)
return;
tensor->enableGrad = reference->enableGrad;
InitTensorV2(tensor, reference->order, reference->dimSize,
reference->dataType, reference->devID);
}
......@@ -2474,6 +2478,7 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
if(reference->order < 0)
return;
tensor->enableGrad = reference->enableGrad;
InitTensorV2(tensor, reference->order, reference->dimSize,
reference->dataType, -1);
}
......
......@@ -151,6 +151,9 @@ public:
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* indicates whether the gradient of the tensor should be computed */
bool enableGrad;
/* indicates whether the tensor is used as paramters (or variables) */
bool isVar;
......
......@@ -143,6 +143,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
}
/*
element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where we keep the result)
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Div(&a, &b, &a, alpha, leadingDim);
}
/*
return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor
>> b - another tensor for division
......@@ -229,9 +246,8 @@ where i is the index of the item
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -245,7 +261,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
......@@ -256,7 +272,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return an XTensor structure)
......@@ -54,7 +55,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -183,9 +183,8 @@ i.e., a is divided with b by broadcasting
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - if add operation to network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -194,7 +193,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha,
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad == true) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -59,7 +59,7 @@ c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
}
/*
mask entries of a given tensor in place:
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
>> a - the tensor to mask; it also keeps the result
>> mask - the mask tensor
>> alpha - the value used where the mask is zero
*/
void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
{
    /* run the out-of-place kernel with the input as its own target */
    XTensor * inOut = &a;
    _Mask(inOut, &mask, inOut, alpha);
}
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......
......@@ -43,6 +43,7 @@ a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha);
/*
mask entries of a given tensor (return an XTensor structure):
......
......@@ -304,7 +304,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -339,7 +339,7 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
......@@ -400,7 +400,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
}
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
......@@ -435,7 +435,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
......
......@@ -60,14 +60,14 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -144,6 +144,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
}
/*
element-wise product of two tensors, computed in place
the input tensor a keeps the result; nothing is returned
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a; it also receives the result
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
    /* reuse the out-of-place kernel with the input as its own target */
    XTensor * inOut = &a;
    _Multiply(inOut, &b, inOut, alpha, leadingDim);
}
/*
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
>> a - a tensor
>> b - another tensor for multiplication
......@@ -230,9 +247,8 @@ where i is the index of the item
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -246,7 +262,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
......@@ -257,7 +273,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return an XTensor structure)
......@@ -54,7 +55,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
}
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
a = a * b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor (where we keep the result)
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
{
_MultiplyDim(&a, &b, &a, n, alpha);
}
/*
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
......@@ -180,9 +198,8 @@ i.e., a is multiplied with b by broadcasting
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> requireLink - if add operation to network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -191,7 +208,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool req
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -347,9 +364,8 @@ where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - if add operation to network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -358,7 +374,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requ
/* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
......
......@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
/* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
......@@ -40,7 +41,7 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -50,7 +51,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -60,6 +60,16 @@ void _NegateMe(XTensor * a)
}
/*
flip the sign of every entry, in place
the input tensor keeps the result; nothing is returned
>> a - the tensor we are processing
*/
void NegateMe(XTensor& a)
{
    /* source and destination are the same tensor */
    XTensor * inOut = &a;
    _Negate(inOut, inOut);
}
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
......@@ -83,9 +93,8 @@ XTensor Negate(const XTensor & a)
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - if add operation to network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
void Negate(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
......@@ -94,7 +103,7 @@ void Negate(const XTensor & a, XTensor & b, bool requireLink)
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
......
......@@ -34,6 +34,7 @@ set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
void NegateMe(XTensor & a);
/*
set every entry to its minus value (return an XTensor structure)
......@@ -42,7 +43,7 @@ make a new tensor to keep the result and return it
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
void Negate(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -66,6 +66,16 @@ void _SignMe(XTensor * a)
}
/*
replace every entry with its sign value, in place
the input tensor keeps the result; nothing is returned
>> a - the tensor we are processing
*/
void SignMe(XTensor& a)
{
    /* source and destination are the same tensor */
    XTensor * inOut = &a;
    _Sign(inOut, inOut);
}
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
......@@ -89,9 +99,8 @@ XTensor Sign(const XTensor & a)
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - if add operation to network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
void Sign(const XTensor & a, XTensor & b)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
......@@ -100,7 +109,7 @@ void Sign(const XTensor & a, XTensor & b, bool requireLink)
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
......
......@@ -36,13 +36,19 @@ keep the result in the input tensor a and return nothing
void _SignMe(XTensor * a);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void SignMe(XTensor & a);
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
void Sign(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -128,6 +128,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
tensor subtraction a = a - b * \beta, computed in place
the tensor a keeps the result; nothing is returned
>> a - a tensor; it also receives the result
>> b - another tensor
>> beta - the scaling factor
*/
void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
{
    /* reuse the out-of-place kernel with the input as its own target */
    XTensor * inOut = &a;
    _Sub(inOut, &b, inOut, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
>> a - a tensor
>> b - another tensor for subtraction
......@@ -203,9 +216,8 @@ tensor subtraction c = a - b * \beta
>> b - another tensor
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> beta - the scaling factor
>> requireLink - if add operation to network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -217,7 +229,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
......@@ -227,7 +239,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta
keep the result in the input tensor a and return nothing
*/
void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
/*
tensor subtraction c = a - b * \beta
......@@ -43,7 +44,7 @@ make a new tensor c to keep the result and return it
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -183,9 +183,8 @@ i.e., a is subtracted with b by broadcasting
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - if add operation to network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -194,7 +193,7 @@ void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -40,7 +40,7 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -133,6 +133,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
tensor summation a = a + b * \beta, computed in place
the tensor a keeps the result; nothing is returned
>> a - a tensor; it also receives the result
>> b - another tensor
>> beta - the scaling factor
*/
void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
{
    /* reuse the out-of-place kernel with the input as its own target */
    XTensor * inOut = &a;
    _Sum(inOut, &b, inOut, beta);
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
......@@ -207,9 +220,8 @@ tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
>> requireLink - if add operation to network
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -221,7 +233,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
......@@ -231,7 +243,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing
*/
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
/*
tensor summation c = a + b * \beta
......@@ -42,7 +43,7 @@ make a new tensor c to keep the result and return it
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -200,9 +200,8 @@ i.e., a is summed with b by broadcasting
>> c - where we put a+b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - if add operation to network
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -211,7 +210,7 @@ void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -368,9 +367,8 @@ c = a + b * \beta
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - if add operation to network
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -379,7 +377,7 @@ void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bo
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
......
......@@ -44,7 +44,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -54,7 +54,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -126,13 +126,13 @@ XTensor funcName(const XTensor &a, float num) \
} \
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, float num, bool requireLink) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (requireLink) { \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
} \
......@@ -165,7 +165,7 @@ SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_ME_INT(ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#else
......
......@@ -37,15 +37,22 @@ void _Scale(const XTensor * a, XTensor * b, float scale);
scale up tensor entires (on site)
b = a * scale
*/
void _ScaleMe(XTensor & a, int scale);
void _ScaleMe(XTensor & a, float scale);
void _ScaleMe(XTensor * a, int scale);
void _ScaleMe(XTensor * a, float scale);
/*
scale up tensor entires (on site)
b = a * scale
*/
void ScaleMe(XTensor & a, int scale);
void ScaleMe(XTensor & a, float scale);
/*
scale up tensor entires
b = a * scale
*/
void Scale(const XTensor & a, XTensor &b, int scale);
void Scale(const XTensor & a, XTensor &b, float scale, bool requireLink = false);
void Scale(const XTensor & a, XTensor &b, float scale);
/*
scale up tensor entires (return an XTensor structure)
......@@ -64,15 +71,22 @@ void _Descale(const XTensor * a, XTensor * b, float scale);
descale tensor entires (on site)
b = a / scale
*/
void _DescaleMe(XTensor & a, int scale);
void _DescaleMe(XTensor & a, float scale);
void _DescaleMe(XTensor * a, int scale);
void _DescaleMe(XTensor * a, float scale);
/*
descale tensor entires (on site)
b = a / scale
*/
void DescaleMe(XTensor & a, int scale);
void DescaleMe(XTensor & a, float scale);
/*
descale tensor entires
b = a / scale
*/
void Descale(const XTensor & a, XTensor & b, int scale);
void Descale(const XTensor & a, XTensor & b, float scale, bool requireLink = false);
void Descale(const XTensor & a, XTensor & b, float scale);
/*
descale tensor entires (return an XTensor structure)
......@@ -91,15 +105,22 @@ void _Shift(const XTensor * a, XTensor * b, float shift);
shift tensor entires (on site)
b = a + shift
*/
void _ShiftMe(XTensor & a, int shift);
void _ShiftMe(XTensor & a, float shift);
void _ShiftMe(XTensor * a, int shift);
void _ShiftMe(XTensor * a, float shift);
/*
shift tensor entires (on site)
b = a + shift
*/
void ShiftMe(XTensor & a, int shift);
void ShiftMe(XTensor & a, float shift);
/*
shift tensor entires
b = a + shift
*/
void Shift(const XTensor & a, XTensor & b, int shift);
void Shift(const XTensor & a, XTensor & b, float shift, bool requireLink = false);
void Shift(const XTensor & a, XTensor & b, float shift);
/*
shift tensor entires (return an XTensor structure)
......@@ -118,7 +139,13 @@ void _Mod(const XTensor * a, XTensor * b, int base);
mod tensor entires (on site)
b = a % mod
*/
void _ModMe(XTensor & a, int base);
void _ModMe(XTensor * a, int base);
/*
mod tensor entires (on site)
b = a % mod
*/
void ModMe(XTensor & a, int base);
/*
mod tensor entires
......
......@@ -71,6 +71,18 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
}
/*
clamp every entry of a tensor into the range [lower, upper] (do it on site)
the clipped values overwrite the input tensor a; nothing is returned
>> a - the tensor we clip in place
>> lower - the lower bound of the clipping range
>> upper - the upper bound of the clipping range
*/
void ClipMe(XTensor& a, DTYPE lower, DTYPE upper)
{
    XTensor * tensor = &a;
    _Clip(tensor, tensor, lower, upper);
}
/*
set every entry to its clip value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
......@@ -94,7 +106,7 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
return b;
}
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink)
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
......@@ -103,7 +115,7 @@ void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool require
/* call _Clip function */
_Clip(&a, &b, lower, upper);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
......
......@@ -33,11 +33,15 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
keep the result in the input tensor a and return nothing */
void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
/* set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing */
void ClipMe(XTensor & a, DTYPE lower, DTYPE upper);
/* set every entry to its clip value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink = false);
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper);
/*
backward of Clip function
......
......@@ -32,6 +32,9 @@ void _Equal(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is equal to the given value (do it on site) */
void _EqualMe(XTensor * a, DTYPE value);
/* check whether every entry is equal to the given value (do it on site) */
void EqualMe(XTensor & a, DTYPE value);
/* check whether every entry is equal to the given value (return an XTensor structure) */
XTensor Equal(const XTensor & a, DTYPE value);
......@@ -41,6 +44,9 @@ void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (do it on site) */
void _NotEqualMe(XTensor * a, DTYPE value);
/* check whether every entry is not equal to the given value (do it on site) */
void NotEqualMe(XTensor & a, DTYPE value);
/* check whether every entry is not equal to the given value (return an XTensor structure) */
XTensor NotEqual(const XTensor & a, DTYPE value);
......
......@@ -113,6 +113,27 @@ void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
/*
normalize the data with a normal distribution (do it on site)
the normalized values overwrite the input tensor; nothing is returned
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the tensor normalized in place
>> dim - the dimension along which the mean and variance were generated
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter used to avoid division by zero
*/
void NormalizeMe(XTensor& input, int dim, const XTensor& mean, const XTensor& var, const XTensor& a, const XTensor& b, DTYPE epsilon)
{
    XTensor * x = &input;
    _Normalize(x, x, dim, &mean, &var, &a, &b, epsilon);
}
/*
normalized the data with normal distribution (return an XTensor structure)
make a new tensor to keep the result and return it
......
......@@ -42,6 +42,14 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalized the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
void NormalizeMe(XTensor & input, int dim, const XTensor & mean, const XTensor & var, const XTensor & a, const XTensor & b, DTYPE epsilon);
/*
normalized the data with normal distribution (return an XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
......@@ -81,6 +81,17 @@ void _PowerMe(XTensor * a, DTYPE p)
}
/*
raise every entry to the power of p, i.e., power(a, p) (do it on site)
the results overwrite the input tensor a; nothing is returned
>> a - the tensor processed in place
>> p - the exponent
*/
void PowerMe(XTensor& a, DTYPE p)
{
    XTensor * tensor = &a;
    _Power(tensor, tensor, p);
}
/*
get the power(a, p) (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
......@@ -107,9 +118,8 @@ get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - parameter
>> requireLink - if add operation to network
*/
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink)
void Power(const XTensor & a, XTensor & b, DTYPE p)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
......@@ -118,7 +128,7 @@ void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink)
/* call _Power function */
_Power(&a, &b, p);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
......
......@@ -36,13 +36,19 @@ keep the result in the input tensor a and return nothing
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void PowerMe(XTensor & a, DTYPE p);
/*
get the power(x, y) (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
/* get the power(x, y) */
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink = false);
void Power(const XTensor & a, XTensor & b, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -92,6 +92,21 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
}
/*
scale and shift all tensor entries (do it on site)
the results overwrite the input tensor a; nothing is returned
a = a * scale + shift
>> a - the tensor processed in place
>> scale - the scaling factor
>> shift - the shifting factor
*/
void ScaleAndShiftMe(XTensor& a, DTYPE scale, DTYPE shift)
{
    XTensor * tensor = &a;
    _ScaleAndShift(tensor, tensor, scale, shift);
}
/*
scale and shift all tensor entires (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -127,9 +142,8 @@ b = a * scale + shift
>> b - the output tensor
>> scale - the scaler factor
>> shift - the shift factor
>> requireLink - if add operation to network
*/
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, bool requireLink)
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
......@@ -138,7 +152,7 @@ void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, boo
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
......
......@@ -45,6 +45,13 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/*
scale and shift all tensor entires
keep the result in the input tensor a and return nothing
a = a * scale + shift
*/
void ScaleAndShiftMe(XTensor & a, DTYPE scale, DTYPE shift = 0);
/*
scale and shift all tensor entires
make a new tensor to keep the result and return it
b = a * scale + shift
*/
......@@ -54,7 +61,7 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
scale and shift all tensor entires
b = a * scale + shift
*/
void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0, bool requireLink = false);
void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -83,13 +83,13 @@ XTensor funcName(const XTensor &a) \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
......@@ -189,13 +189,13 @@ XTensor funcName(const XTensor &a) \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
void funcName(const XTensor &a, XTensor &b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
if (b.enableGrad) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
......
......@@ -31,110 +31,140 @@ void _Absolute(const XTensor * a, XTensor * b);
/* set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing */
void _AbsoluteMe(XTensor * a);
/* set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing */
void AbsoluteMe(XTensor & a);
/* set every entry to its absolute value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a);
/* set every entry to its absolute value */
void Absolute(const XTensor & a, XTensor & b, bool requireLink = false);
void Absolute(const XTensor & a, XTensor & b);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void CeilMe(XTensor & a);
/* set every entry to its ceil value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its ceil value */
void Ceil(const XTensor & a, XTensor & b, bool requireLink = false);
void Ceil(const XTensor & a, XTensor & b);
/* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b);
/* set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing */
void _ExpMe(XTensor * a);
/* set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing */
void ExpMe(XTensor & a);
/* set every entry to its exponent value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a);
/* set every entry to its exponent value */
void Exp(const XTensor & a, XTensor & b, bool requireLink = false);
void Exp(const XTensor & a, XTensor & b);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void FloorMe(XTensor & a);
/* set every entry to its floor value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its floor value */
void Floor(const XTensor & a, XTensor & b, bool requireLink = false);
void Floor(const XTensor & a, XTensor & b);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _IsNonZero(const XTensor *a, XTensor *b);
/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void _IsNonZeroMe(XTensor *a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void IsNonZeroMe(XTensor &a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsNonZero(const XTensor &a);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void IsNonZero(const XTensor &a, XTensor & b, bool requireLink = false);
void IsNonZero(const XTensor &a, XTensor & b);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b);
/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void _IsZeroMe(XTensor *a);
/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void IsZeroMe(XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsZero(const XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void IsZero(const XTensor &a, XTensor & b, bool requireLink = false);
void IsZero(const XTensor &a, XTensor & b);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void _LogMe(XTensor * a);
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void LogMe(XTensor & a);
/* set every entry to its logarithm value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b, bool requireLink = false);
void Log(const XTensor & a, XTensor & b);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void RoundMe(XTensor & a);
/* set every entry to its round value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its round value */
void Round(const XTensor & a, XTensor & b, bool requireLink = false);
void Round(const XTensor & a, XTensor & b);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void SqrtMe(XTensor & a);
/* set every entry to its sqrt value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its sqrt value */
void Sqrt(const XTensor & a, XTensor & b, bool requireLink = false);
void Sqrt(const XTensor & a, XTensor & b);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void SquareMe(XTensor & a);
/* set every entry to its square value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
/* set every entry to its square value */
void Square(const XTensor & a, XTensor & b, bool requireLink = false);
void Square(const XTensor & a, XTensor & b);
/* set every entry to its sine value */
......@@ -142,33 +172,42 @@ void _Sin(const XTensor * a, XTensor * b);
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void _SinMe(XTensor * a);
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void SinMe(XTensor & a);
/* set every entry to its sine value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a);
/* set every entry to its sine value */
void Sin(const XTensor & a, XTensor & b, bool requireLink = false);
void Sin(const XTensor & a, XTensor & b);
/* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b);
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void _CosMe(XTensor * a);
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void CosMe(XTensor & a);
/* set every entry to its cosine value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a);
/* set every entry to its cosine value */
void Cos(const XTensor & a, XTensor & b, bool requireLink = false);
void Cos(const XTensor & a, XTensor & b);
/* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b);
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void _TanMe(XTensor * a);
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void TanMe(XTensor & a);
/* set every entry to its tangent value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
/* set every entry to its tangent value */
void Tan(const XTensor & a, XTensor & b, bool requireLink = false);
void Tan(const XTensor & a, XTensor & b);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -137,9 +137,8 @@ get the max value of the items along a dimension of the tensor
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> requireLink - if add operation to network
*/
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink)
void ReduceMax(const XTensor &input, XTensor &output, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -163,7 +162,7 @@ void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink)
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
if (requireLink) {
if (output.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -36,7 +36,7 @@ make a new tensor to keep the result and return it
XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
void ReduceMax(const XTensor &input, XTensor &output, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -94,9 +94,8 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> requireLink - if add operation to network
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink)
void ReduceMean(const XTensor &input, XTensor &output, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -120,7 +119,7 @@ void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
if (requireLink) {
if (output.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -43,7 +43,7 @@ XTensor ReduceMean(const XTensor &input, int dim);
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
void ReduceMean(const XTensor &input, XTensor &output, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -244,7 +244,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
return output;
}
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp, bool requireLink)
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -268,7 +268,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &sh
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
if (requireLink) {
if (output.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......@@ -336,9 +336,8 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> shift - shift the input
>> isExp - specify if the exp() is performed
>> power - we perform pow(item_i, power) on each item in the array
>> requireLink - if add operation to network
*/
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp, bool requireLink)
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -362,7 +361,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
if (requireLink) {
if (output.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -44,7 +44,7 @@ sum = \sum_i exp(a_i - shift) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false);
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return an XTensor structure)
......@@ -61,7 +61,7 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
*/
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false);
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -91,9 +91,8 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> shift - bias on the input
>> requireLink - if add operation to network
*/
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink)
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -117,7 +116,7 @@ void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTen
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
if (requireLink) {
if (output.enableGrad) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -45,7 +45,7 @@ squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
*/
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink = false);
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -94,9 +94,8 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> mean - the mean value
>> requireLink - if add operation to network
*/
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink)
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
......@@ -120,7 +119,7 @@ void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTenso
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
if (requireLink) {
if (output.enableGrad) {
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
......
......@@ -43,7 +43,7 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean);
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink = false);
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -232,7 +232,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
return t;
}
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool requireLink)
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
{
if (!t.isInit || !CheckMergeSize(&s, &t, whereToMerge, leadingDim)) {
if (leadingDim < 0)
......@@ -261,7 +261,7 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
if (requireLink) {
if (t.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
......
......@@ -33,7 +33,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -
e.g., (M, N/3, 3) -> (M, N) */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1, bool requireLink = false);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge);
......
......@@ -42,6 +42,13 @@ a = permuted(a)
void _PermuteMe(XTensor * a, int * dimPermute);
/*
permute the tensor dimensions (do it on site).
keep the result in the input tensor and return nothing.
a = permuted(a)
*/
void PermuteMe(XTensor &a, int * dimPermute);
/*
make a tensor with permuted dimensions (return an XTensor structure).
make a new tensor to keep the result and return it.
b = permuted(a)
......
......@@ -48,7 +48,7 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
return t;
}
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink)
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)
{
if (!t.isInit || !XTensor::IsSameShaped(&t, &s)) {
InitTensor(&t, &s);
......@@ -57,7 +57,7 @@ void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink)
/* call Reshape function */
t.Reshape(order, dimSize);
if (requireLink) {
if (t.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
}
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* reshape the tensor */
XTensor Reshape(XTensor &s, int order, int * dimSize);
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink = false);
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize);
} // namespace nts(NiuTrans.Tensor)
#endif // __RESHAPE_H__
......@@ -227,7 +227,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
return t;
}
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink)
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum)
{
if (!t.isInit || !CheckSplitSize(&s, &t, whereToSplit, splitNum)) {
int order = s.order + 1;
......@@ -251,7 +251,7 @@ void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool re
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
if (requireLink) {
if (t.enableGrad) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
......
......@@ -41,7 +41,7 @@ e.g., (M, N) -> (M, N/3, 3)
*/
XTensor Split(const XTensor &s, int whereToSplit, int splitNum);
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink = false);
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */
void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum);
......
......@@ -89,6 +89,20 @@ void _SqueezeMe(XTensor * source, int leadingDim)
}
/*
squeeze the tensor along the specified dimension (do it on site)
the squeezed result overwrites the input tensor; nothing is returned
>> source - the tensor squeezed in place
>> leadingDim - the dimension that we would squeeze;
                if leadingDim = -1, squeeze all dimensions that are 1,
                otherwise squeeze only the specified dimension
*/
void SqueezeMe(XTensor& source, int leadingDim)
{
    XTensor * tensor = &source;
    _Squeeze(tensor, tensor, leadingDim);
}
/*
squeeze the tensor along the specified dimension (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -112,7 +126,7 @@ XTensor Squeeze(XTensor & source, int leadingDim)
return target;
}
void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLink)
void Squeeze(XTensor & source, XTensor & target, int leadingDim)
{
if (!target.isInit || !XTensor::IsSameShaped(&source, &target)) {
InitTensor(&target, &source);
......@@ -121,7 +135,7 @@ void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLin
/* call _Squeeze function */
_Squeeze(&source, &target, leadingDim);
if (requireLink) {
if (target.enableGrad) {
/* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
}
......
......@@ -33,11 +33,15 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1);
keep the result in the input tensor a and return nothing */
void _SqueezeMe(XTensor * source, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (do it on site)
keep the result in the input tensor a and return nothing */
void SqueezeMe(XTensor & source, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Squeeze(XTensor & source, int leadingDim = -1);
void Squeeze(XTensor & source, XTensor & target, int leadingDim = -1, bool requireLink = false);
void Squeeze(XTensor & source, XTensor & target, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -166,7 +166,7 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
return b;
}
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink)
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize)
{
if (!b.isInit || !CheckUnsqueezeSize(&a, &b, dim, dSize)) {
int order = a.order + 1;
......@@ -191,7 +191,7 @@ void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLin
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
if (requireLink) {
if (b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
......
......@@ -35,7 +35,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
make a new tensor to keep the result and return it */
XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink = false);
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -98,6 +98,21 @@ void _SortMe(XTensor * a, XTensor * index, int dim)
}
/*
sort the tensor along a given dimension (do it on site)
keep the result in the input tensor "a" and return nothing
>> a - the input tensor; it is overwritten with the sorted result
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void SortMe(XTensor& a, XTensor& index, int dim)
{
/* in-place variant: a serves as both input and output of _Sort */
_Sort(&a, &a, &index, dim);
}
/*
sort the tensor along a given dimension (return an XTensor structure)
make a new tensor to keep the result and return it
......
......@@ -36,6 +36,12 @@ keep the result in the input tensor a and return nothing
void _SortMe(XTensor * a, XTensor * index, int dim);
/*
sort the data along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
*/
void SortMe(XTensor & a, XTensor & index, int dim);
/*
sort the data along a given dimension (return an XTensor structure)
make a new tensor to keep the result and return it
*/
......
......@@ -84,7 +84,7 @@ XTensor HardTanH(const XTensor &x)
return y;
}
void HardTanH(const XTensor &x, XTensor &y, bool requireLink)
void HardTanH(const XTensor &x, XTensor &y)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
......@@ -93,7 +93,7 @@ void HardTanH(const XTensor &x, XTensor &y, bool requireLink)
/* call _HardTanH function */
_HardTanH(&x, &y);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
}
......
......@@ -40,7 +40,7 @@ void _HardTanH(const XTensor * x, XTensor * y);
/* hard tanh function (return an XTensor structure) */
XTensor HardTanH(const XTensor &x);
void HardTanH(const XTensor &x, XTensor &y, bool requireLink = false);
void HardTanH(const XTensor &x, XTensor &y);
/* de/dx */
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
......
......@@ -58,7 +58,7 @@ XTensor Identity(const XTensor &x)
return y;
}
void Identity(const XTensor &x, XTensor &y, bool requireLink)
void Identity(const XTensor &x, XTensor &y)
{
if (!y.isInit || !y.IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
......@@ -67,7 +67,7 @@ void Identity(const XTensor &x, XTensor &y, bool requireLink)
/* call _Identity function */
_Identity(&x, &y);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
}
......
......@@ -33,7 +33,7 @@ void _Identity(const XTensor * x, XTensor * y);
/* identity function y = x (return an XTensor structure) */
XTensor Identity(const XTensor &x);
void Identity(const XTensor &x, XTensor &y, bool requireLink = false);
void Identity(const XTensor &x, XTensor &y);
/* de/dx */
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
......
......@@ -194,7 +194,15 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
return y;
}
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i})
keep the result in the output tensor y and return nothing
>> x - input vector
>> y - output vector
>> leadDim - leading dimension (along which we perform reduction)
*/
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
{
int ld = leadDim;
if (ld < 0)
......@@ -207,32 +215,13 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, ld);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
}
}
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i})
make a new tensor to keep the result and return it
>> x - input vector
>> y - output vector
>> leadDim - leading dimension (along which we perform reduction)
*/
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
{
if(!XTensor::IsSameShaped(&x, &y))
InitTensor(&y, &x);
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
}
/*
backward computation for dense matrices with default data type
......
......@@ -33,8 +33,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim);
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
......
......@@ -77,7 +77,7 @@ XTensor Rectify(const XTensor &x)
return y;
}
void Rectify(const XTensor &x, XTensor &y, bool requireLink)
void Rectify(const XTensor &x, XTensor &y)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
......@@ -86,7 +86,7 @@ void Rectify(const XTensor &x, XTensor &y, bool requireLink)
/* call _Rectify function */
_Rectify(&x, &y);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
}
......
......@@ -33,7 +33,7 @@ void _Rectify(const XTensor * x, XTensor * y);
/* rectify function y = max(0, x) (return an XTensor structure) */
XTensor Rectify(const XTensor &x);
void Rectify(const XTensor &x, XTensor &y, bool requireLink = false);
void Rectify(const XTensor &x, XTensor &y);
/* de/dx */
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
......
......@@ -75,7 +75,7 @@ XTensor Sigmoid(const XTensor &x)
return y;
}
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink)
void Sigmoid(const XTensor &x, XTensor &y)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
......@@ -84,7 +84,7 @@ void Sigmoid(const XTensor &x, XTensor &y, bool requireLink)
/* call _Sigmoid function */
_Sigmoid(&x, &y);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
}
......
......@@ -33,7 +33,7 @@ void _Sigmoid(const XTensor * x, XTensor * y);
/* sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure) */
XTensor Sigmoid(const XTensor &x);
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink = false);
void Sigmoid(const XTensor &x, XTensor &y);
/* de/dx */
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
......
......@@ -148,7 +148,7 @@ XTensor Softmax(const XTensor &x, int leadDim)
return y;
}
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
void Softmax(const XTensor &x, XTensor &y, int leadDim)
{
int ld = leadDim;
if (ld < 0)
......@@ -161,7 +161,7 @@ void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
/* call _Softmax function */
_Softmax(&x, &y, ld);
if (requireLink) {
if (y.enableGrad) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
......
......@@ -33,7 +33,7 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (return an XTensor structure) */
XTensor Softmax(const XTensor &x, int leadDim);
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false);
void Softmax(const XTensor &x, XTensor &y, int leadDim);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论