Commit baad6629 by xiaotong

improve the space management

parent 6ea64b51
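This change threads an isEfficient flag through the back-propagation code so that gradients are allocated and computed only for tensors that need them (parameters and the nodes on the paths leading to them), and so that intermediate gradients can be freed as soon as their consumers are done. A minimal usage sketch follows; it only uses calls that appear in this diff (XNet, SetGradEfficientFlag, Backward, XList), while the output/gold tensors and the CROSSENTROPY loss name are assumptions for illustration.

XNet net;

/* keep gradients only where they are needed and free the rest early */
net.SetGradEfficientFlag(true);

XList roots;
roots.Add(&output);                /* network output tensor (assumed) */
XList golds;
golds.Add(&gold);                  /* gold-standard answers (assumed) */

/* MakeEfficientNet() first marks every node that needs dE/dx, then the
   backward pass computes those gradients, and ClearGrad() deletes the
   ones that are no longer needed */
net.Backward(roots, golds, CROSSENTROPY);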
......@@ -29,7 +29,7 @@
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node)
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
......
......@@ -35,7 +35,7 @@ class XFuncGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for an activation function */
static
......
......@@ -28,69 +28,73 @@
namespace nts{
/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node)
void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if(!isEfficient){
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else{
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_ABSOLUTE)
GradAbsolute(node);
GradAbsolute(node, isEfficient);
else if(operID == MATH_COS)
GradCos(node);
GradCos(node, isEfficient);
else if(operID == MATH_EXP)
GradExp(node);
GradExp(node, isEfficient);
else if(operID == MATH_LOG)
GradLog(node);
GradLog(node, isEfficient);
else if(operID == MATH_ROUND)
GradRound(node);
GradRound(node, isEfficient);
else if(operID == MATH_SIGN)
GradSign(node);
GradSign(node, isEfficient);
else if(operID == MATH_SIN)
GradSin(node);
GradSin(node, isEfficient);
else if(operID == MATH_TAN)
GradTan(node);
GradTan(node, isEfficient);
else if(operID == MATH_CLIP)
GradClip(node);
GradClip(node, isEfficient);
else if(operID == MATH_DIV)
GradDiv(node);
GradDiv(node, isEfficient);
else if(operID == MATH_DIVDIM)
GradDivDim(node);
GradDivDim(node, isEfficient);
else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node);
GradMatrixMul(node, isEfficient);
else if(operID == MATH_MATRIXMULBATCHED)
GradMatrixMulBatched(node);
GradMatrixMulBatched(node, isEfficient);
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
GradMultiply(node, isEfficient);
else if(operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node);
GradMultiplyDim(node, isEfficient);
else if(operID == MATH_NEGATE)
GradNegate(node);
GradNegate(node, isEfficient);
else if(operID == MATH_NORMALIZE)
GradNormalize(node);
GradNormalize(node, isEfficient);
else if(operID == MATH_POWER)
GradPower(node);
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node);
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node);
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
GradSubDim(node);
GradSubDim(node, isEfficient);
else if(operID == MATH_SUM)
GradSum(node);
GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM)
GradSumDim(node);
GradSumDim(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node);
GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM)
GradReduceSum(node);
GradReduceSum(node, isEfficient);
else if(operID == REDUCE_REDUCESUMSQUARED)
GradReduceSumSquared(node);
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node);
GradReduceVariance(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
......@@ -111,8 +115,10 @@ we have
dE/da =  dE/dc    if a >= 0
        -dE/dc    if a < 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradAbsolute(XTensor * node)
void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
......@@ -137,8 +143,10 @@ c = cos(a)
we have
dE/da = dE/dc * -sin(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradCos(XTensor * node)
void XMathGrad::GradCos(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
......@@ -164,8 +172,10 @@ c = exp(a)
we have
dE/da = dE/dc * exp(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradExp(XTensor * node)
void XMathGrad::GradExp(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
......@@ -190,8 +200,10 @@ c = log(a)
we have
dE/da = dE/dc * 1/a
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradLog(XTensor * node)
void XMathGrad::GradLog(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for LOG!");
......@@ -212,8 +224,10 @@ c = round(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradRound(XTensor * node)
void XMathGrad::GradRound(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
......@@ -231,8 +245,10 @@ c = sign(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSign(XTensor * node)
void XMathGrad::GradSign(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
......@@ -250,8 +266,10 @@ c = sin(a)
we have
dE/da = dE/dc * cos(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSin(XTensor * node)
void XMathGrad::GradSin(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
......@@ -276,8 +294,10 @@ c = tan(a)
we have
dE/da = dE/dc * 1/(cos(a))^2
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradTan(XTensor * node)
void XMathGrad::GradTan(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
......@@ -302,8 +322,10 @@ we have
dE/da = 1    if lower < a < upper
dE/da = 0    otherwise
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradClip(XTensor * node)
void XMathGrad::GradClip(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
......@@ -332,8 +354,10 @@ we have
dE/da = dE/dc / b
dE/db = dE/dc * a / -b^2
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDiv(XTensor * node)
void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVIDE!");
......@@ -365,8 +389,12 @@ c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * (1/b)
dE/db = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDivDim(XTensor * node)
void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVDIM!");
......@@ -466,8 +494,10 @@ we have
dE/da = dE/dc * b^T * \alpha
dE/db = a^T * dE/dc * \alpha
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMul(XTensor * node)
void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -479,7 +509,9 @@ void XMathGrad::GradMatrixMul(XTensor * node)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
if(!isEfficient || a->isGrad)
XNoder::MakeGrad(a);
if(!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
XTensor * c = node;
......@@ -487,9 +519,9 @@ void XMathGrad::GradMatrixMul(XTensor * node)
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(deda->order == 2 && dedb->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
else if(transA == X_NOTRANS && deda->order > 2 && dedb->order == 2){
if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
int orderBackupA = a->order;
int orderBackupC = c->order;
int dimsBackupA[MAX_TENSOR_DIM_NUM];
......@@ -499,13 +531,15 @@ void XMathGrad::GradMatrixMul(XTensor * node)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
if(!isEfficient || a->isGrad)
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
a->Reshape(orderBackupA, dimsBackupA);
c->Reshape(orderBackupC, dimsBackupC);
if(!isEfficient || a->isGrad)
deda->Reshape(orderBackupA, dimsBackupA);
dedc->Reshape(orderBackupC, dimsBackupC);
}
......@@ -524,18 +558,22 @@ gradient for matrix multiply: c = matmul(a, b) * \alpha
>> dedb - dE/db
>> dedc - dE/dc
>> alpha - the scalar
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha)
XTensor * dedc, DTYPE alpha, bool isEfficient)
{
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
/* dE/da = dE/dc * b^T * \alpha */
if(!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
if(!isEfficient || b->isGrad)
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
......@@ -544,9 +582,11 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
/* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad)
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
if(!isEfficient || b->isGrad)
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
......@@ -554,10 +594,12 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
else if(transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
if(!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */
if(!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
}
......@@ -566,10 +608,12 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
/* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */
if(!isEfficient || a->isGrad)
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */
if(!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
}
......@@ -582,8 +626,10 @@ we have
dE/da_i = dE/dc_i * b_i^T * \alpha
dE/db_i = a_i^T * dE/dc_i * \alpha
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMulBatched(XTensor * node)
void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -657,8 +703,10 @@ we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiply(XTensor * node)
void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -681,8 +729,12 @@ c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiplyDim(XTensor * node)
void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
......@@ -771,8 +823,10 @@ c = -a
we have
dE/da = dE/dc * (-1)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradNegate(XTensor * node)
void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
......@@ -793,8 +847,10 @@ void XMathGrad::GradNegate(XTensor * node)
/*
gradient for normalize
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradNormalize(XTensor * node)
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("This is really a bad piece of code!!!");
......@@ -887,8 +943,10 @@ c = pow(a,p)
we have
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradPower(XTensor * node)
void XMathGrad::GradPower(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
......@@ -916,8 +974,10 @@ c = a * scale + shift
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScaleAndShift(XTensor * node)
void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
......@@ -941,8 +1001,10 @@ we have
dE/da = dE/dc
dE/db = -dE/dc * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSub(XTensor * node)
void XMathGrad::GradSub(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");
......@@ -966,8 +1028,11 @@ c = a - b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = - dE/dc.reduce(0,...,n-1,n+1,...) * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSubDim(XTensor * node)
void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
......@@ -1063,9 +1128,12 @@ c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSum(XTensor * node)
void XMathGrad::GradSum(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
......@@ -1074,11 +1142,15 @@ void XMathGrad::GradSum(XTensor * node)
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
if(!isEfficient || a->isGrad){
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
}
if(!isEfficient || b->isGrad){
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta);
}
node->visitMark = NODE_FINISHED;
}
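The change to GradSum above shows the guard pattern that the commit applies to every operator gradient: an input's gradient is allocated and accumulated only when that input needs one, or when the efficient mode is off. A generic sketch of the pattern (not code from the commit; the accumulation lines are placeholders for the operator-specific math):

void GradTwoInputOp(XTensor * node, bool isEfficient)
{
    XTensor * a = node->income.tails[0];
    XTensor * b = node->income.tails[1];

    if(!isEfficient || a->isGrad){     /* dE/da is actually needed */
        XNoder::MakeGrad(a);           /* allocate a->grad on demand */
        /* accumulate dE/da into a->grad using node->grad (dE/dc) */
    }

    if(!isEfficient || b->isGrad){     /* dE/db is actually needed */
        XNoder::MakeGrad(b);
        /* accumulate dE/db into b->grad */
    }

    node->visitMark = NODE_FINISHED;
}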
......@@ -1089,8 +1161,12 @@ c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc.reduce(0,...,n-1,n+1,...) * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSumDim(XTensor * node)
void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
......@@ -1183,9 +1259,12 @@ for
c = reduceMean(a, dim)
we have
dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim]
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceMean(XTensor * node)
void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
......@@ -1213,9 +1292,12 @@ for
c = reduceSum(a, dim)
we have
dE/da = Unsqueeze(dE/dc) * 1
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceSum(XTensor * node)
void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
......@@ -1243,9 +1325,12 @@ c = \sum_i (a_i - b)^2
we have
dE/da = Unsqueeze(dE/dc) * 2a
dE/db = dE/dc * -2 * n * b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceSumSquared(XTensor * node)
void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
......@@ -1292,9 +1377,12 @@ where b is the mean, and n is the size of a
we have
dE/da = Unsqueeze(dE/dc) * 2a/n
dE/db = dE/dc * -2 * b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceVariance(XTensor * node)
void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
......
......@@ -33,7 +33,7 @@ class XMathGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a math operation */
static
......@@ -43,121 +43,121 @@ private:
/* gradient for absolute */
static
void GradAbsolute(XTensor * node);
void GradAbsolute(XTensor * node, bool isEfficient);
/* gradient for cos */
static
void GradCos(XTensor * node);
void GradCos(XTensor * node, bool isEfficient);
/* gradient for exp */
static
void GradExp(XTensor * node);
void GradExp(XTensor * node, bool isEfficient);
/* gradient for log: c = log(a) */
static
void GradLog(XTensor * node);
void GradLog(XTensor * node, bool isEfficient);
/* gradient for round */
static
void GradRound(XTensor * node);
void GradRound(XTensor * node, bool isEfficient);
/* gradient for sign */
static
void GradSign(XTensor * node);
void GradSign(XTensor * node, bool isEfficient);
/* gradient for sin */
static
void GradSin(XTensor * node);
void GradSin(XTensor * node, bool isEfficient);
/* gradient for tan */
static
void GradTan(XTensor * node);
void GradTan(XTensor * node, bool isEfficient);
/* gradient for clip */
static
void GradClip(XTensor * node);
void GradClip(XTensor * node, bool isEfficient);
/* gradient for Divide */
static
void GradDiv(XTensor * node);
void GradDiv(XTensor * node, bool isEfficient);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
void GradDivDim(XTensor * node, bool isEfficient);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * node);
void GradMatrixMul(XTensor * node, bool isEfficient);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha);
XTensor * dedc, DTYPE alpha, bool isEfficient);
/* gradient for matrix multiply in batch mode.
for each batch: c_i = matmul(a_i, b_i) * \alpha */
static
void GradMatrixMulBatched(XTensor * node);
void GradMatrixMulBatched(XTensor * node, bool isEfficient);
/* gradient for multiply (dot production): c = a * b * \alpha */
static
void GradMultiply(XTensor * node);
void GradMultiply(XTensor * node, bool isEfficient);
/* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node);
void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for negate */
static
void GradNegate(XTensor * node);
void GradNegate(XTensor * node, bool isEfficient);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
void GradNormalize(XTensor * node, bool isEfficient);
/* gradient for power */
static
void GradPower(XTensor * node);
void GradPower(XTensor * node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node);
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node);
void GradSub(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node);
void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sum: c = a + b * \beta */
static
void GradSum(XTensor * node);
void GradSum(XTensor * node, bool isEfficient);
/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSumDim(XTensor * node);
void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node);
void GradReduceMean(XTensor * node, bool isEfficient);
/* gradient for reduceSum */
static
void GradReduceSum(XTensor * node);
void GradReduceSum(XTensor * node, bool isEfficient);
/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node);
void GradReduceSumSquared(XTensor * node, bool isEfficient);
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node);
void GradReduceVariance(XTensor * node, bool isEfficient);
};
}
......
......@@ -30,7 +30,7 @@
namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node)
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
......@@ -38,17 +38,17 @@ void XShapeGrad::MakeGrad(XTensor * node)
int operID = income.typeID;
if(operID == SHAPE_MERGE)
GradMerge(node);
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
GradMergeList(node);
GradMergeList(node, isEfficent);
else if(operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node);
GradUnsqueeze(node, isEfficent);
else if(operID == SHAPE_SPLIT)
GradSplit(node);
GradSplit(node, isEfficent);
else if(operID == SHAPE_SPLIT_LIST)
GradSplitList(node);
GradSplitList(node, isEfficent);
else if (operID == SHAPE_TRANSPOSE)
GradTranspose(node);
GradTranspose(node, isEfficent);
else{
ShowNTErrors("TODO!");
}
......@@ -62,10 +62,10 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
}
/* post processing of a node */
void XShapeGrad::PostProcessing(XTensor * node, int typeID)
void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficent)
{
if(typeID == SHAPE_SPLIT_LIST)
GradSplitListPost(node);
GradSplitListPost(node, isEfficent);
}
/*
......@@ -80,8 +80,10 @@ dE/db_1 = dE/dc_{split_1}
i.e.,
dE/da = split(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradMerge(XTensor * node)
void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -162,8 +164,10 @@ dE/db = dE/dc_{split_1}
i.e.,
list(dE/da, dE/db, ...) = split(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradMergeList(XTensor * node)
void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
......@@ -239,8 +243,10 @@ c = split(a)
we have
dE/da = merge(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplit(XTensor * node)
void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -279,8 +285,10 @@ list(c_1, ...) = split(a)
we have
dE/da = merge(dE/c_1, ...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplitList(XTensor * node)
void XShapeGrad::GradSplitList(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -299,8 +307,10 @@ have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs
at one time. This is good for system speed-up.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplitListPost(XTensor * node)
void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
{
/* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */
......@@ -351,8 +361,10 @@ c = unsqueeze(a)
we have
dE/da = reduecesum(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradUnsqueeze(XTensor * node)
void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
......@@ -379,8 +391,10 @@ c = Transpose(a)
we have
dE/da = Transpose(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradTranspose(XTensor * node)
void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TRANSPOSE!");
......
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficent);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,38 +42,38 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId);
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
private:
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node);
void GradMerge(XTensor * node, bool isEfficent);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node);
void GradMergeList(XTensor * node, bool isEfficent);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node);
void GradSplit(XTensor * node, bool isEfficent);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node);
void GradSplitList(XTensor * node, bool isEfficent);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at one time. This is good for system speed-up. */
static
void GradSplitListPost(XTensor * node);
void GradSplitListPost(XTensor * node, bool isEfficent);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node);
void GradUnsqueeze(XTensor * node, bool isEfficent);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradTranspose(XTensor * node);
void GradTranspose(XTensor * node, bool isEfficent);
};
......
......@@ -55,6 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = false;
}
/* de-constructor */
......@@ -115,6 +116,10 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
/* label tensors where the backward computation is necessary */
if(isGradEfficient)
MakeEfficientNet();
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
......@@ -154,10 +159,20 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark == NODE_FINISHED)
continue;
if(node->visitMark != NODE_FINISHED)
BackwardNode(node, isGradEfficient);
BackwardNode(node);
if(isGradEfficient){
if(!XNoder::IsLeaf(node)){
XLink & outgo = node->outgo;
for(int i = 0; i < outgo.tailNum; i++){
XTensor * parent = outgo.tails[i];
ClearGrad(parent);
}
}
else
ClearGrad(node);
}
}
}
......@@ -179,27 +194,32 @@ void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
/*
backward computation for a given node
>> node - the node keeps the result of an operation (e.g., activation function)
>> isEfficient - indicates whether the back-propagation is computed in an
efficient manner
*/
void XNet::BackwardNode(XTensor * node)
void XNet::BackwardNode(XTensor * node, bool isEfficent)
{
if(node == NULL || node->visitMark == NODE_FINISHED)
return;
if(!XNoder::IsLeaf(node)){
/* post processing for parent nodes */
BackwardNodePost(node);
BackwardNodePost(node, isEfficent);
/* process the current node */
if(XMathGrad::IsMathOP(node))
XMathGrad::MakeGrad(node);
XMathGrad::MakeGrad(node, isEfficent);
else if(XFuncGrad::IsFunc(node))
XFuncGrad::MakeGrad(node);
XFuncGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node);
XShapeGrad::MakeGrad(node, isEfficent);
else{
ShowNTErrors("Wrong node type!");
}
}
else{
node->visitMark = NODE_FINISHED;
}
}
/*
......@@ -207,7 +227,7 @@ backward computation (in post processing) for a given node
>> node - the node whose parent nodes are not processed yet. So
we do the job at the child node.
*/
void XNet::BackwardNodePost(XTensor * node)
void XNet::BackwardNodePost(XTensor * node, bool isEfficent)
{
bool isSplitList = false;
XLink &outgo = node->outgo;
......@@ -217,7 +237,7 @@ void XNet::BackwardNodePost(XTensor * node)
}
if(isSplitList)
XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST);
XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST, isEfficent);
}
/*
......@@ -304,4 +324,62 @@ void XNet::Dump(FILE * file)
}
}
/*
set the gradient-efficient flag
>> flag - the flag
*/
void XNet::SetGradEfficientFlag(bool flag)
{
isGradEfficient = flag;
}
/* generate the gradient-efficient flag for every node */
void XNet::MakeEfficientNet()
{
/* back-propagation from output to input */
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
XLink &income = node->income;
for(int j = 0; j < income.tailNum; j++){
XTensor * child = income.tails[j];
if(child->isGrad || child->isVar){
node->SetGradFlag(true);
break;
}
}
}
}
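The rule implemented by MakeEfficientNet above is: a node needs a gradient whenever one of its inputs is a variable or is itself already marked, so the flag flows from the parameters toward the output as long as the nodes are visited with inputs before outputs. A self-contained toy version of that rule, assuming a simplified node struct that only mimics the fields used here (it is not the NiuTensor XTensor):

#include <cassert>
#include <vector>

struct ToyNode {
    bool isVar  = false;               /* parameter tensor */
    bool isGrad = false;               /* gradient will be computed */
    std::vector<ToyNode*> inputs;      /* plays the role of income.tails */
};

/* nodes must come in topological order, inputs before outputs */
void MarkEfficient(std::vector<ToyNode*> &nodes)
{
    for (ToyNode * node : nodes) {
        for (ToyNode * child : node->inputs) {
            if (child->isGrad || child->isVar) {
                node->isGrad = true;
                break;
            }
        }
    }
}

int main()
{
    ToyNode x, w, h, y;
    w.isVar = true;                    /* w is a parameter */
    h.inputs = { &x, &w };             /* h = f(x, w) */
    y.inputs = { &h };                 /* y = g(h) */

    std::vector<ToyNode*> nodes = { &x, &w, &h, &y };
    MarkEfficient(nodes);

    assert(h.isGrad && y.isGrad);      /* on the path from w to the output */
    assert(!x.isGrad);                 /* dE/dx will never be computed */
    return 0;
}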
/*
clear the gradient information if the node is of no use
>> node - the node that we want to clear
*/
void XNet::ClearGrad(XTensor * node)
{
if(node->isVar)
return;
if(node->grad == NULL)
return;
if(node->visitMark != NODE_FINISHED)
return;
XLink & income = node->income;
bool finished = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
if(child->visitMark != NODE_FINISHED){
finished = false;
break;
}
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
}
}
\ No newline at end of file
......@@ -47,6 +47,9 @@ struct XNet
/* input nodes of the network */
XList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
/* constructor */
XNet();
......@@ -71,10 +74,10 @@ struct XNet
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node);
void BackwardNode(XTensor * node, bool isEfficent = false);
/* backward computation (in post processing) for a given node */
void BackwardNodePost(XTensor * node);
void BackwardNodePost(XTensor * node, bool isEfficent = false);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
......@@ -89,6 +92,15 @@ struct XNet
/* dump network information */
void Dump(FILE * file);
/* set the gradient-efficient flag */
void SetGradEfficientFlag(bool flag = true);
/* generate the gradient-efficient flag for every node */
void MakeEfficientNet();
/* clear the gradient information if the node is of no use */
void ClearGrad(XTensor * node);
};
/* we make a unique id for every tensor */
......