Commit ceb5b101 by xuchen

1. add gather function
2. add cross entropy forward computation and backward computation
3. code optimization
4. merge with xiaotong-working branch
parent 102db468
......@@ -29,10 +29,8 @@
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node)
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
......
......@@ -35,7 +35,7 @@ class XFuncGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for an activation function */
static
......
......@@ -28,69 +28,73 @@
namespace nts{
/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node)
void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
if(!isEfficient){
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else{
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_ABSOLUTE)
GradAbsolute(node);
GradAbsolute(node, isEfficient);
else if(operID == MATH_COS)
GradCos(node);
GradCos(node, isEfficient);
else if(operID == MATH_EXP)
GradExp(node);
GradExp(node, isEfficient);
else if(operID == MATH_LOG)
GradLog(node);
GradLog(node, isEfficient);
else if(operID == MATH_ROUND)
GradRound(node);
GradRound(node, isEfficient);
else if(operID == MATH_SIGN)
GradSign(node);
GradSign(node, isEfficient);
else if(operID == MATH_SIN)
GradSin(node);
GradSin(node, isEfficient);
else if(operID == MATH_TAN)
GradTan(node);
GradTan(node, isEfficient);
else if(operID == MATH_CLIP)
GradClip(node);
GradClip(node, isEfficient);
else if(operID == MATH_DIV)
GradDiv(node);
GradDiv(node, isEfficient);
else if(operID == MATH_DIVDIM)
GradDivDim(node);
GradDivDim(node, isEfficient);
else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node);
GradMatrixMul(node, isEfficient);
else if(operID == MATH_MATRIXMULBATCHED)
GradMatrixMulBatched(node);
GradMatrixMulBatched(node, isEfficient);
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
GradMultiply(node, isEfficient);
else if(operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node);
GradMultiplyDim(node, isEfficient);
else if(operID == MATH_NEGATE)
GradNegate(node);
GradNegate(node, isEfficient);
else if(operID == MATH_NORMALIZE)
GradNormalize(node);
GradNormalize(node, isEfficient);
else if(operID == MATH_POWER)
GradPower(node);
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node);
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node);
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
GradSubDim(node);
GradSubDim(node, isEfficient);
else if(operID == MATH_SUM)
GradSum(node);
GradSum(node, isEfficient);
else if(operID == MATH_SUMDIM)
GradSumDim(node);
GradSumDim(node, isEfficient);
else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node);
GradReduceMean(node, isEfficient);
else if(operID == REDUCE_REDUCESUM)
GradReduceSum(node);
GradReduceSum(node, isEfficient);
else if(operID == REDUCE_REDUCESUMSQUARED)
GradReduceSumSquared(node);
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node);
GradReduceVariance(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
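Every Grad* routine above now receives the new isEfficient flag and applies the same guard before touching an input's gradient. A minimal sketch of that guard, using only calls that appear elsewhere in this commit (the helper name GradForInput is hypothetical, for illustration only):

/* compute and accumulate dE/dinput only when a gradient is wanted */
void GradForInput(XTensor * input, XTensor * dedc, bool isEfficient)
{
    /* in efficient mode, skip inputs whose isGrad flag is not set */
    if(!isEfficient || input->isGrad){
        XNoder::MakeGrad(input);              /* allocate input->grad on demand */
        _Sum(input->grad, dedc, input->grad); /* input->grad += dE/dc */
    }
}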
......@@ -111,8 +115,10 @@ we have
dE/da = dE/dc a >= 0
-dE/dc a < 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradAbsolute(XTensor * node)
void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
......@@ -137,8 +143,10 @@ c = cos(a)
we have
dE/da = dE/dc * -sin(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradCos(XTensor * node)
void XMathGrad::GradCos(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
......@@ -164,8 +172,10 @@ c = exp(a)
we have
dE/da = dE/dc * exp(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradExp(XTensor * node)
void XMathGrad::GradExp(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
......@@ -190,8 +200,10 @@ c = log(a)
we have
dE/da = dE/dc * 1/a
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradLog(XTensor * node)
void XMathGrad::GradLog(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for LOG!");
......@@ -212,8 +224,10 @@ c = round(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradRound(XTensor * node)
void XMathGrad::GradRound(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
......@@ -231,8 +245,10 @@ c = sign(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSign(XTensor * node)
void XMathGrad::GradSign(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
......@@ -250,8 +266,10 @@ c = sin(a)
we have
dE/da = dE/dc * cos(a)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSin(XTensor * node)
void XMathGrad::GradSin(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
......@@ -276,8 +294,10 @@ c = tan(a)
we have
dE/da = dE/dc * 1/(cos(a))^2
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradTan(XTensor * node)
void XMathGrad::GradTan(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
......@@ -302,8 +322,10 @@ we have
dE/da = 1 lower < a < upper
dE/da = 0 otherwise
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradClip(XTensor * node)
void XMathGrad::GradClip(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
......@@ -332,8 +354,10 @@ we have
dE/da = dE/dc / b
dE/db = dE/dc * a / -b^2
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDiv(XTensor * node)
void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVIDE!");
......@@ -365,8 +389,12 @@ c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * (1/b)
dE/db = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDivDim(XTensor * node)
void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVDIM!");
......@@ -391,7 +419,7 @@ void XMathGrad::GradDivDim(XTensor * node)
XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2);
_Power(b, bTMP, -2.0F);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
......@@ -405,17 +433,17 @@ void XMathGrad::GradDivDim(XTensor * node)
size of b. Then we can reduce the matrix into a row vector. */
interGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGradTMP, b->grad, 0);
}
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -437,17 +465,17 @@ void XMathGrad::GradDivDim(XTensor * node)
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}
}*/
DelTensorBuf(interGrad);
}
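The reshape-and-reduce passages here (and the analogous ones in GradMultiplyDim, GradSubDim and GradSumDim below) all compute the same thing: when b was broadcast along dimension n of a, dE/db must collapse dE/dc over every dimension except n. A standalone sketch of the simplest case, under hypothetical shapes (a is m x k, b has length k, n == 1), independent of the buffer tricks used above:

#include <vector>

/* sum a row-major m x k gradient matrix over its rows, yielding the
   length-k gradient of the broadcast operand b */
std::vector<float> ReduceGradForB(const std::vector<float> &dedc, int m, int k)
{
    std::vector<float> dedb(k, 0.0f);
    for(int i = 0; i < m; i++)
        for(int j = 0; j < k; j++)
            dedb[j] += dedc[i * k + j];
    return dedb;
}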
......@@ -466,8 +494,10 @@ we have
dE/da = dE/dc * b^T * \alpha
dE/db = a^T * dE/dc * \alpha
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMul(XTensor * node)
void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -479,17 +509,19 @@ void XMathGrad::GradMatrixMul(XTensor * node)
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
if(!isEfficient || a->isGrad)
XNoder::MakeGrad(a);
if(!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(deda->order == 2 && dedb->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
else if(transA == X_NOTRANS && deda->order > 2 && dedb->order == 2){
if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
int orderBackupA = a->order;
int orderBackupC = c->order;
int dimsBackupA[MAX_TENSOR_DIM_NUM];
......@@ -499,14 +531,16 @@ void XMathGrad::GradMatrixMul(XTensor * node)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
if(!isEfficient || a->isGrad)
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
a->Reshape(orderBackupA, dimsBackupA);
c->Reshape(orderBackupC, dimsBackupC);
deda->Reshape(orderBackupA, dimsBackupA);
if(!isEfficient || a->isGrad)
deda->Reshape(orderBackupA, dimsBackupA);
dedc->Reshape(orderBackupC, dimsBackupC);
}
else{
......@@ -524,19 +558,23 @@ gradient for matrix multiply: c = matmul(a, b) * \alpha
>> dedb - dE/db
>> dedc - dE/dc
>> alpha - the scalar
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha)
XTensor * dedc, DTYPE alpha, bool isEfficient)
{
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
/* dE/da = dE/dc * b^T * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
if(!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
if(!isEfficient || b->isGrad)
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b * \alpha */
......@@ -544,21 +582,25 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
/* dE/da = (dE/dc * b^T)^T * \alpha
= b * dE/dc^T * \alpha */
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
if(!isEfficient || a->isGrad)
_MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
if(!isEfficient || b->isGrad)
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
if(!isEfficient || a->isGrad)
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = (a^T * dE/dc)^T * \alpha
= dE/dc^T * a * \alpha */
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
if(!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b^T * \alpha */
......@@ -566,11 +608,13 @@ void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE tra
/* dE/da = (dE/dc * b)^T * \alpha
= b^T * dE/dc^T * \alpha */
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
if(!isEfficient || a->isGrad)
_MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
/* dE/db = (a * dE/dc)^T * \alpha
= dE/dc^T * a^T * \alpha */
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
if(!isEfficient || b->isGrad)
_MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
}
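A quick sanity check of the first case: for c = a * b and a scalar error E, the chain rule gives dE/da_{ij} = sum_k dE/dc_{ik} * b_{jk} = (dE/dc * b^T)_{ij}, which is exactly the _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F) call; the other three cases follow by transposing a and/or b in the same identity.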
......@@ -582,8 +626,10 @@ we have
dE/da_i = dE/dc_i * b_i^T * \alpha
dE/db_i = a_i^T * dE/dc_i * \alpha
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMatrixMulBatched(XTensor * node)
void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -657,8 +703,10 @@ we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiply(XTensor * node)
void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
......@@ -681,8 +729,12 @@ c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMultiplyDim(XTensor * node)
void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
......@@ -713,17 +765,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}
}*/
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -745,17 +797,17 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
}
}*/
DelTensorBuf(interGrad);
}
......@@ -771,8 +823,10 @@ c = -a
we have
dE/da = dE/dc * (-1)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradNegate(XTensor * node)
void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
......@@ -793,8 +847,10 @@ void XMathGrad::GradNegate(XTensor * node)
/*
gradient for normalize
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradNormalize(XTensor * node)
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("This is really a bad piece of code!!!");
......@@ -887,8 +943,10 @@ c = pow(a,p)
we have
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradPower(XTensor * node)
void XMathGrad::GradPower(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
......@@ -916,8 +974,10 @@ c = a * scale + shift
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScaleAndShift(XTensor * node)
void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
......@@ -941,8 +1001,10 @@ we have
dE/da = dE/dc
dE/db = -dE/dc * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSub(XTensor * node)
void XMathGrad::GradSub(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");
......@@ -966,8 +1028,11 @@ c = a - b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = -dE/dc.reduce(0,...,n-1,n+1,...) * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSubDim(XTensor * node)
void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
......@@ -994,20 +1059,20 @@ void XMathGrad::GradSubDim(XTensor * node)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
}*/
node->grad->Reshape(order, dimSize);
}
......@@ -1032,20 +1097,20 @@ void XMathGrad::GradSubDim(XTensor * node)
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
}*/
node->grad->Reshape(order, dimSize);
......@@ -1063,9 +1128,12 @@ c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSum(XTensor * node)
void XMathGrad::GradSum(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
......@@ -1074,11 +1142,15 @@ void XMathGrad::GradSum(XTensor * node)
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
if(!isEfficient || a->isGrad){
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
}
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
if(!isEfficient || b->isGrad){
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta);
}
node->visitMark = NODE_FINISHED;
}
......@@ -1088,9 +1160,13 @@ gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
dE/db = dE/dc.reduce(0,...,n-1,n+1,...) * \beta
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradSumDim(XTensor * node)
void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
......@@ -1117,19 +1193,19 @@ void XMathGrad::GradSumDim(XTensor * node)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
}*/
node->grad->Reshape(order, dimSize);
}
......@@ -1154,19 +1230,19 @@ void XMathGrad::GradSumDim(XTensor * node)
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
}*/
node->grad->Reshape(order, dimSize);
......@@ -1183,9 +1259,12 @@ for
c = reduceMean(a, dim)
we have
dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim]
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceMean(XTensor * node)
void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
......@@ -1213,9 +1292,12 @@ for
c = reduceSum(a, dim)
we have
dE/da = Unsqueeze(dE/dc) * 1
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceSum(XTensor * node)
void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
......@@ -1243,9 +1325,12 @@ c = \sum_i (a_i - b)^2
we have
dE/da = Unsqueeze(dE/dc) * 2a
dE/db = dE/dc * -2 * n * b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceSumSquared(XTensor * node)
void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
......@@ -1292,9 +1377,12 @@ where b is the mean, and n is the size of a
we have
dE/da = Unsqueeze(dE/dc) * 2a/n
dE/db = dE/dc * -2 * b
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradReduceVariance(XTensor * node)
void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
......
......@@ -33,7 +33,7 @@ class XMathGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a math operation */
static
......@@ -43,121 +43,121 @@ private:
/* gradient for absolute */
static
void GradAbsolute(XTensor * node);
void GradAbsolute(XTensor * node, bool isEfficient);
/* gradient for cos */
static
void GradCos(XTensor * node);
void GradCos(XTensor * node, bool isEfficient);
/* gradient for exp */
static
void GradExp(XTensor * node);
void GradExp(XTensor * node, bool isEfficient);
/* gradient for log: c = log(a) */
static
void GradLog(XTensor * node);
void GradLog(XTensor * node, bool isEfficient);
/* gradient for round */
static
void GradRound(XTensor * node);
void GradRound(XTensor * node, bool isEfficient);
/* gradient for sign */
static
void GradSign(XTensor * node);
void GradSign(XTensor * node, bool isEfficient);
/* gradient for sin */
static
void GradSin(XTensor * node);
void GradSin(XTensor * node, bool isEfficient);
/* gradient for tan */
static
void GradTan(XTensor * node);
void GradTan(XTensor * node, bool isEfficient);
/* gradient for clip */
static
void GradClip(XTensor * node);
void GradClip(XTensor * node, bool isEfficient);
/* gradient for Divide */
static
void GradDiv(XTensor * node);
void GradDiv(XTensor * node, bool isEfficient);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
void GradDivDim(XTensor * node, bool isEfficient);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * node);
void GradMatrixMul(XTensor * node, bool isEfficient);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
void GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
XTensor * dedc, DTYPE alpha);
XTensor * dedc, DTYPE alpha, bool isEfficient);
/* gradient for matrix multiply in batch mode.
for each batch: c_i = matmul(a_i, b_i) * \alpha */
static
void GradMatrixMulBatched(XTensor * node);
void GradMatrixMulBatched(XTensor * node, bool isEfficient);
/* gradient for multiply (dot production): c = a * b * \alpha */
static
void GradMultiply(XTensor * node);
void GradMultiply(XTensor * node, bool isEfficient);
/* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradMultiplyDim(XTensor * node);
void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for negate */
static
void GradNegate(XTensor * node);
void GradNegate(XTensor * node, bool isEfficient);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
void GradNormalize(XTensor * node, bool isEfficient);
/* gradient for power */
static
void GradPower(XTensor * node);
void GradPower(XTensor * node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node);
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node);
void GradSub(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node);
void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sum: c = a + b * \beta */
static
void GradSum(XTensor * node);
void GradSum(XTensor * node, bool isEfficient);
/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSumDim(XTensor * node);
void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for reduceMean */
static
void GradReduceMean(XTensor * node);
void GradReduceMean(XTensor * node, bool isEfficient);
/* gradient for reduceSum */
static
void GradReduceSum(XTensor * node);
void GradReduceSum(XTensor * node, bool isEfficient);
/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node);
void GradReduceSumSquared(XTensor * node, bool isEfficient);
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node);
void GradReduceVariance(XTensor * node, bool isEfficient);
};
}
......
......@@ -26,29 +26,34 @@
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node)
void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == SHAPE_MERGE)
GradMerge(node);
if(operID == MOVEMENT_COPYINDEXED)
GradCopyIndexed(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
GradMergeList(node);
else if(operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node);
GradMergeList(node, isEfficent);
else if(operID == SHAPE_RESHAPE)
GradReshape(node, isEfficent);
else if(operID == SHAPE_SPLIT)
GradSplit(node);
GradSplit(node, isEfficent);
else if(operID == SHAPE_SPLIT_LIST)
GradSplitList(node);
GradSplitList(node, isEfficent);
else if (operID == SHAPE_TRANSPOSE)
GradTranspose(node);
GradTranspose(node, isEfficent);
else if(operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node, isEfficent);
else{
ShowNTErrors("TODO!");
}
......@@ -62,10 +67,54 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
}
/* post processing of a node */
void XShapeGrad::PostProcessing(XTensor * node, int typeID)
void XShapeGrad::PostProcessing(XTensor * node, int typeID, bool isEfficent)
{
if(typeID == SHAPE_SPLIT_LIST)
GradSplitListPost(node);
GradSplitListPost(node, isEfficent);
}
/*
gradient computation for copying indexed sub-tensors
for
b = copyindexed(a)
we have
dE/da = spread(dE/db)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
int dim = income.GetParamInt(0);
int * srcIndex = (int *)income.GetParamPointer(1);
int indexSize = income.GetParamInt(2);
int * tgtIndex = (int *)income.GetParamPointer(3);
int copyNum = income.GetParamInt(4);
int realIndexSize = indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
int * realTgtIndex = new int[realIndexSize];
for(int i = 0; i < indexSize; i++) {
for(int j = 0; j < copyNum; j++) {
realSrcIndex[i * copyNum + j] = srcIndex[i] + j;
realTgtIndex[i * copyNum + j] = tgtIndex[i] + j;
}
}
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
_SpreadForGather(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
delete[] realSrcIndex;
delete[] realTgtIndex;
delete[] srcIndex;
delete[] tgtIndex;
node->visitMark = NODE_FINISHED;
}
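For concreteness, a hypothetical example of the index expansion above: with dim = 0, srcIndex = {2, 5}, tgtIndex = {0, 3}, indexSize = 2 and copyNum = 3, the loops produce realSrcIndex = {2, 3, 4, 5, 6, 7} and realTgtIndex = {0, 1, 2, 3, 4, 5}, so _SpreadForGather accumulates the six indexed slices of node->grad back into the corresponding rows of input->grad along dimension 0.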
/*
......@@ -80,14 +129,16 @@ dE/db_1 = dE/dc_{split_1}
i.e.,
dE/da = split(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradMerge(XTensor * node)
void XShapeGrad::GradMerge(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
CheckNTErrors(node->order == input->order - 1, "wrong tensor orders!");
CheckNTErrors(node->order == input->order - 1, "Wrong tensor orders!");
int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1);
......@@ -162,8 +213,10 @@ dE/db = dE/dc_{split_1}
i.e.,
list(dE/da, dE/db, ...) = split(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradMergeList(XTensor * node)
void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
......@@ -233,14 +286,46 @@ void XShapeGrad::GradMergeList(XTensor * node)
}
/*
gradient computation for reshaping a tensor
for
b = reshape(a)
we have
dE/da = reshape(dE/db)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradReshape(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for RESHAPE!");
int order = income.GetParamInt(0);
int * dimSize = (int *)income.GetParamPointer(1);
node->grad->Reshape(order, dimSize);
_CopyValues(node->grad, input->grad);
delete[] dimSize;
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for split:
for
c = split(a)
we have
dE/da = merge(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplit(XTensor * node)
void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -262,10 +347,12 @@ void XShapeGrad::GradSplit(XTensor * node)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else{
XTensor inputGradTMP(input);
XTensor * inputGradTMP = NewTensorBuf(input, input->devID, input->mem);
_Merge(node->grad, &inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, &inputGradTMP, input->grad);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
}
node->visitMark = NODE_FINISHED;
......@@ -279,8 +366,10 @@ list(c_1, ...) = split(a)
we have
dE/da = merge(dE/dc_1, ...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplitList(XTensor * node)
void XShapeGrad::GradSplitList(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
XTensor * input = income.tails[0];
......@@ -299,8 +388,10 @@ have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs
at one time, which speeds up the system.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradSplitListPost(XTensor * node)
void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
{
/* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */
......@@ -337,71 +428,84 @@ void XShapeGrad::GradSplitListPost(XTensor * node)
somewhere else, we need another SUM for gradient
accumulation */
else{
XTensor nodeGradTMP(node);
XTensor * nodeGradTMP = NewTensorBuf(node, node->devID, node->mem);
_Merge(&splits, &nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, &nodeGradTMP, node->grad);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
}
}
/*
gradient for unsqueezing a tensor
/*
gradient for transposing a tensor
for
c = unsqueeze(a)
c = Transpose(a)
we have
dE/da = reducesum(dE/dc)
dE/da = Transpose(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradUnsqueeze(XTensor * node)
void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TRANSPOSE!");
XTensor * output = node;
XTensor * input = income.tails[0];
XTensor * b = NewTensorBuf(input, input->devID, input->mem);
XNoder::MakeGrad(input);
int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1);
int i = income.GetParamInt(0);
int j = income.GetParamInt(1);
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
_ReduceSum(output->grad, input->grad, dim);
_Transpose(output->grad, b, i, j);
_Sum(input->grad, b, input->grad);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for transposing a tensor
/*
gradient for unsqueezing a tensor
for
c = Transpose(a)
c = unsqueeze(a)
we have
dE/da = Transpose(dE/dc)
dE/da = reducesum(dE/dc)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XShapeGrad::GradTranspose(XTensor * node)
void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TRANSPOSE!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
XTensor * output = node;
XTensor * input = income.tails[0];
XTensor * b = NewTensor(input);
XNoder::MakeGrad(input);
int i = income.GetParamInt(0);
int j = income.GetParamInt(1);
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1);
_Transpose(output->grad, b, i, j);
_Sum(input->grad, b, input->grad);
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
XTensor * g = NewTensorBuf(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, g, dim);
_Sum(input->grad, g, input->grad);
DelTensorBuf(g);
node->visitMark = NODE_FINISHED;
delete b;
}
}
\ No newline at end of file
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
void MakeGrad(XTensor * node, bool isEfficent);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,39 +42,48 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId);
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
private:
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node);
void GradMerge(XTensor * node, bool isEfficent);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node);
void GradMergeList(XTensor * node, bool isEfficent);
/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficent);
/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficent);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node);
void GradSplit(XTensor * node, bool isEfficent);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node);
void GradSplitList(XTensor * node, bool isEfficent);
/* gradient computation for splitting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of splitting have been processed. We do this in a post-processing
manner because we can fuse multiple memory copy jobs at one time, which speeds up the system. */
static
void GradSplitListPost(XTensor * node);
void GradSplitListPost(XTensor * node, bool isEfficent);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node);
void GradUnsqueeze(XTensor * node, bool isEfficent);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradTranspose(XTensor * node);
};
}
......
......@@ -55,6 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = true;
}
/* de-constructor */
......@@ -115,6 +116,10 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
/* label tensors where the backward computation is necessary */
if(isGradEfficient)
MakeEfficientNet();
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
......@@ -154,10 +159,19 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark == NODE_FINISHED)
continue;
if(node->visitMark != NODE_FINISHED)
BackwardNode(node, isGradEfficient);
if(isGradEfficient){
XLink & outgo = node->outgo;
for(int i = 0; i < outgo.tailNum; i++){
XTensor * parent = outgo.tails[i];
ClearGrad(parent);
}
BackwardNode(node);
if(XNoder::IsLeaf(node))
ClearGrad(node);
}
}
}
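The cleanup order in the loop above matters: after a node is processed, its parents' gradients can be freed as soon as they are no longer needed, and leaf gradients can be dropped immediately. ClearGrad below makes this safe by only deleting a gradient once the node itself and all of its incoming children are marked NODE_FINISHED.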
......@@ -179,27 +193,32 @@ void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
/*
backward computation for a given node
>> node - the node keeps the result of an operation (e.g., activation function)
>> isEfficient - indicates whether the back-propagation is computed in an
efficient manner
*/
void XNet::BackwardNode(XTensor * node)
void XNet::BackwardNode(XTensor * node, bool isEfficent)
{
if(node == NULL || node->visitMark == NODE_FINISHED)
return;
if(!XNoder::IsLeaf(node)){
/* post processing for parent nodes */
BackwardNodePost(node);
BackwardNodePost(node, isEfficent);
/* process the current node */
if(XMathGrad::IsMathOP(node))
XMathGrad::MakeGrad(node);
XMathGrad::MakeGrad(node, isEfficent);
else if(XFuncGrad::IsFunc(node))
XFuncGrad::MakeGrad(node);
XFuncGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node);
XShapeGrad::MakeGrad(node, isEfficent);
else{
ShowNTErrors("Wrong node type!");
}
}
else{
node->visitMark = NODE_FINISHED;
}
}
/*
......@@ -207,7 +226,7 @@ backward computation (in post processing) for a given node
>> node - the node whose parent nodes are not processed yet. So
we do the job at the child node.
*/
void XNet::BackwardNodePost(XTensor * node)
void XNet::BackwardNodePost(XTensor * node, bool isEfficent)
{
bool isSplitList = false;
XLink &outgo = node->outgo;
......@@ -217,7 +236,7 @@ void XNet::BackwardNodePost(XTensor * node)
}
if(isSplitList)
XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST);
XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST, isEfficent);
}
/*
......@@ -284,6 +303,8 @@ void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
node->visitMark = code + 2;
orders.Add(node);
}
else if(node->visitMark == code + 2){
}
}
/*
......@@ -304,4 +325,62 @@ void XNet::Dump(FILE * file)
}
}
/*
set the flag of gradient-efficient
>> flag - the flag
*/
void XNet::SetGradEfficientFlag(bool flag)
{
isGradEfficient = flag;
}
/* generate the gradient-efficient flag for every node */
void XNet::MakeEfficientNet()
{
/* back-propagation from output to input */
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
XLink &income = node->income;
for(int j = 0; j < income.tailNum; j++){
XTensor * child = income.tails[j];
if(child->isGrad || child->isVar){
node->SetGradFlag(true);
break;
}
}
}
}
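Putting the pieces together, the intended usage is sketched below with the calls defined in this commit (isGradEfficient already defaults to true in the constructor, so the explicit call is optional; output and gold are tensors as in the FNN LM code below):

XNet autoDiffer;
autoDiffer.SetGradEfficientFlag(true);           /* keep gradients only where needed */
autoDiffer.Backward(output, gold, CROSSENTROPY); /* as in the FNN LM training loop */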
/*
clear the gradient information if the node is no longer needed
>> node - the node that we want to clear
*/
void XNet::ClearGrad(XTensor * node)
{
if(node->isVar)
return;
if(node->grad == NULL)
return;
if(node->visitMark != NODE_FINISHED)
return;
XLink & income = node->income;
bool finished = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
if(child->visitMark != NODE_FINISHED){
finished = false;
break;
}
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
}
}
\ No newline at end of file
......@@ -47,6 +47,9 @@ struct XNet
/* input nodes of the network */
XList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
/* constructor */
XNet();
......@@ -71,10 +74,10 @@ struct XNet
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node);
void BackwardNode(XTensor * node, bool isEfficent = false);
/* backward computation (in post processing) for a given node */
void BackwardNodePost(XTensor * node);
void BackwardNodePost(XTensor * node, bool isEfficent = false);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
......@@ -89,6 +92,15 @@ struct XNet
/* dump network information */
void Dump(FILE * file);
/* set the flag of gradient-efficient */
void SetGradEfficientFlag(bool flag = true);
/* generate the gradient-efficient flag for every node */
void MakeEfficientNet();
/* clear the gradient information if the node is no longer needed */
void ClearGrad(XTensor * node);
};
/* we make a unique id for every tensor */
......
......@@ -74,6 +74,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model);
/*
entry of the program
......@@ -476,7 +477,12 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
Clear(model, true);
/* forward + backward process */
ForwardAutoDiff(inputs, output, model);
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
......@@ -975,7 +981,55 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
}
/*
forward process (with tensor connections)
forward process (with tensor connections) (this is implemented by gather function)
>> ngrams - the loaded ngrams
>> batch - the tensor encoding a batch of words
>> output - output probability
>> model - the fnn model
*/
void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
{
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
int size = batch * (n-1);
int * index = new int[size];
for(int i = 0; i < batch; i++){
for (int j = 0; j < n-1; j++){
int a = i * (n - 1) + j;
index[a] = ngrams[i].words[j];
}
}
XTensor embedding;
embedding = Gather(model.embeddingW, 0, index, size);
delete[] index;
int dimSize[2];
dimSize[0] = embedding.GetDim(0) / (n - 1);
dimSize[1] = embedding.GetDim(1) * (n - 1);
hidden = Reshape(embedding, embedding.order, dimSize);
/* hidden layers */
for(int i = 0; i < depth; i++)
hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//XLink::ShowNetwork(stderr, &output);
}
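To make the shapes concrete (hypothetical numbers): with batch = 2 ngrams and n = 3, each ngram contributes n - 1 = 2 context words, so index holds size = 4 word ids; Gather(model.embeddingW, 0, index, size) pulls the four corresponding rows of the embedding matrix into a [4, eSize] tensor, and the Reshape folds them into the [2, 2 * eSize] input of the first hidden layer.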
/*
forward process (with tensor connections) (this is implemented by multiply function)
>> inputs - input word representations
>> output - output probability
>> model - the fnn model
......@@ -1122,8 +1176,12 @@ void Test(const char * test, const char * result, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
}
else {
ForwardAutoDiff(inputs, output, model);
else {
/* this is implemented by gather function */
ForwardAutoDiff(ngrams, ngramNum, output, model);
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
}
/* prediction probabilities */
......
......@@ -53,7 +53,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, const char ** argv,
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
......@@ -69,18 +69,22 @@ void T2TAttention::InitModel(int argc, const char ** argv,
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
wa.SetDataRand(-finfouta, finfouta);
}
/*
......@@ -90,10 +94,11 @@ make the network
and H = vector size of each position
>> q - queries
>> v - values
>> maske - as it is
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
......@@ -123,14 +128,17 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return Merge(att, att.order - 1);
return MMul(Merge(att, att.order - 1), wa);
}
}
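Two things change in Make above. First, the dk columns produced by wk are split into nhead heads before the dot product, so each head works in dk/nhead dimensions and the per-head computation becomes the standard scaled dot-product attention, softmax(q * k^T / sqrt(dk/nhead) + mask) * v, with the new dropout applied to the attention weights during training. Second, the merged heads are now passed through the new output projection wa instead of being returned directly.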
......@@ -57,6 +57,9 @@ public:
/* transformation matrix for V */
XTensor wv;
/* transformation after dot-product attention */
XTensor wa;
/* size of transformed Q and K */
int dk;
......@@ -75,6 +78,9 @@ public:
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
......@@ -84,12 +90,12 @@ public:
~T2TAttention();
/* initialize the model */
void InitModel(int argc, const char ** argv,
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask);
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
......@@ -34,7 +34,7 @@ class AttDecoder : T2TDecoder
{
public:
/* initialize the model */
void InitModel(int argc, const char ** argv);
void InitModel(int argc, char ** argv);
};
}
......
......@@ -48,7 +48,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -60,7 +60,8 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRand(-v, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
......@@ -79,6 +80,17 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
/*
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
......@@ -89,6 +101,7 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
*/
}
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
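Written out, the new loop fills the first half of each row with sines and the second half with cosines, instead of interleaving them by even/odd channel as the commented-out block did: for 0 <= i < eSize/2, PE(pos, i) = sin(pos / 10000^(2i/(d-2))) and PE(pos, i + eSize/2) = cos(pos / 10000^(2i/(d-2))).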
......@@ -135,7 +148,7 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
......@@ -71,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -51,7 +51,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, const char ** argv,
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
......@@ -89,16 +89,17 @@ void AttEncoder::InitModel(int argc, const char ** argv,
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> skipInputRes - indicates whether we skip the residual connection of the first layer
>> isTraining - indicates whether the model is for training
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
......@@ -109,37 +110,21 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool
XTensor fnn;
XTensor res;
/* we skip the residual connection for the first layer if
the encoder is used in language modeling. */
if(skipInputRes && i == 0){
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* layer normalization */
x = attLayerNorms[i].Make(att);
}
else{
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining);
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* residual connection */
res = Sum(att, x);
/* layer normalization */
x = attLayerNorms[i].Make(res);
}
/* layer normalization */
x = attLayerNorms[i].Make(res);
/* fnn */
fnn = fnns[i].Make(x);
fnn = fnns[i].Make(x, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -150,9 +135,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
}
return x;
......
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining) = 0;
XTensor Make(XTensor &input, XTensor &mask, bool isTraining) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
};
......@@ -113,12 +113,12 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, const char ** argv,
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
};
......
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......@@ -58,8 +58,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE * 4);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -83,12 +84,15 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input)
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, w1) + b1);
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MMul(t1, w2) + b2;
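/* the training-time forward pass above is therefore
   y = Dropout(max(0, x * w1 + b1)) * w2 + b2, with dropout applied
   only when isTraining is true and dropoutP > 0 (a restatement of
   the code, not new behavior) */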
......
......@@ -59,6 +59,9 @@ public:
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
......@@ -69,10 +72,10 @@ public:
~T2TFNN();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
XTensor Make(XTensor &input, bool isTraining);
};
......
......@@ -32,7 +32,8 @@ namespace transformer
T2TLN::T2TLN()
{
devID = -1;
mem = NULL;
mem = NULL;
d = 0;
}
/* de-constructor */
......@@ -47,28 +48,28 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
int d = 0;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / (d + d));
float finfout = (float)sqrt(6.0F * scale / d);
w.SetDataRand(-finfout, finfout);
b.SetZeroAll();
}
/*
make the network
for each layer representation x, we have
y = w * (x - \mu) / \sigma + b
>> input - the input tensor
>> return - layer normalization output
*/
......@@ -90,16 +91,17 @@ XTensor T2TLN::Make(XTensor &input)
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled)/standardFilled;
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return MMul(xn, w) + b;
return xn * w + b;
}
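/* with w now a d-dimensional vector, Make() computes the usual layer
   normalization y = w * (x - \mu) / \sigma + b elementwise, where \mu
   and \sigma are the mean and standard deviation over the last
   dimension (a reading of the code above) */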
}
......@@ -45,6 +45,9 @@ public:
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
......@@ -54,7 +57,7 @@ public:
~T2TLN();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -48,7 +48,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TModel::InitModel(int argc, const char ** argv)
void T2TModel::InitModel(int argc, char ** argv)
{
bool useMem = false;
int memSize = 0;
......@@ -64,25 +64,32 @@ void T2TModel::InitModel(int argc, const char ** argv)
if(useMem){
delete mem;
mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem = new XMem(devID, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
}
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
XList params(10);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * param = (XTensor*)params.Get(i);
param->SetVarFlag();
}
}
/*
make the encoding network
>> input - input tensor
>> mask - the mask indicating which positions are involved in the computation
>> skipInputRes - indicates whether we skip the residual connection of the first layer
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder.Make(input, mask, skipInputRes, isTraining);
return encoder.Make(input, mask, isTraining);
}
/*
......@@ -134,9 +141,9 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTr
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
//_Sum(&mask, padding3, &mask);
_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, true, isTraining);
encoding = MakeEncoding(input, mask, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
......@@ -167,6 +174,7 @@ void T2TModel::GetParams(XList &list)
list.Add(&encoder.attentions[i].wk);
list.Add(&encoder.attentions[i].wq);
list.Add(&encoder.attentions[i].wv);
list.Add(&encoder.attentions[i].wa);
list.Add(&encoder.fnnLayerNorms[i].w);
list.Add(&encoder.fnnLayerNorms[i].b);
list.Add(&encoder.attLayerNorms[i].w);
......
......@@ -66,10 +66,10 @@ public:
~T2TModel();
/* initialize the model */
void InitModel(int argc, const char ** argv);
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
......
......@@ -49,7 +49,7 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
......
......@@ -59,7 +59,7 @@ public:
~T2TOutput();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -26,6 +26,11 @@
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
namespace transformer
{
......@@ -33,15 +38,25 @@ namespace transformer
T2TTrainer::T2TTrainer()
{
seqLen = NULL;
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
argNum = 0;
argArray = NULL;
buf = NULL;
buf2 = NULL;
bufSize = 0;
seqOffset = NULL;
}
/* de-constructor */
T2TTrainer::~T2TTrainer()
{
delete[] buf;
delete[] buf2;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
for(int i = 0; i < moments.count; i++){
......@@ -53,6 +68,11 @@ T2TTrainer::~T2TTrainer()
XTensor * m = (XTensor*)moments2nd.Get(i);
delete m;
}
for(int i = 0; i < argNum; i++)
delete[] argArray[i];
delete[] argArray;
}
/*
......@@ -60,8 +80,15 @@ initialization
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TTrainer::Init(int argc, const char ** argv)
void T2TTrainer::Init(int argc, char ** argv)
{
argNum = argc;
argArray = new char*[argc];
for(int i = 0; i < argNum; i++){
argArray[i] = new char[strlen(argv[i]) + 1];
strcpy(argArray[i], argv[i]);
}
bool useMem = false;
LoadParamBool(argc, argv, "mem", &useMem, useMem);
......@@ -78,16 +105,23 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
LoadParamFloat(argc, argv, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, argv, "adambeta2", &adamBeta2, 0.999F);
LoadParamFloat(argc, argv, "adamdelta", &adamDelta, 1e-8F);
buf = new int[bufSize];
seqLen = new int[bufSize];
LoadParamFloat(argc, argv, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argc, argv, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argc, argv, "shuffled", &isShuffled, false);
LoadParamFloat(argc, argv, "labelsmoothing", &labelSmoothingP, 0);
LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
buf = new int[bufSize];
buf2 = new int[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
int tc = 0;
......@@ -95,9 +129,11 @@ int tc = 0;
/*
train the model
>> fn - training data file
>> validFN - validation data file
>> modelFN - where we keep the model
>> model - model to train
*/
void T2TTrainer::Train(const char * fn, T2TModel * model)
void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model)
{
int epoch = 0;
int step = 0;
......@@ -107,32 +143,39 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
bool isEnd = false;
float loss = 0;
float lr = 0;
int nStepCheck = 0;
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
char * trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
#ifndef WIN32
if(isShuffled)
sprintf(trainFN, "%s.random", fn);
#endif
PrepareModel(model);
int devID = model->devID;
XMem * mem = model->mem;
if(mem != NULL && mem->mode == UNI_FREE)
mem->SetPin();
XNet net;
tf = fopen("tmp.xx.txt", "wb");
tc = 0;
double startT = GetClockSec();
for(epoch = 1; epoch <= nepoch; epoch++){
#ifndef WIN32
if(isShuffled)
Shuffle(fn, trainFN);
#endif
FILE * file = fopen(fn, "rb");
FILE * file = fopen(trainFN, "rb");
CheckNTErrors(file, "cannot open training file!");
wordCount = 0;
loss = 0;
if(mem != NULL)
mem->BackToPin();
/* batch of input sequences */
XTensor batch;
......@@ -143,33 +186,69 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* gold standard */
XTensor gold;
while(LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)){
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
model->Make(batch, output, padding, true);
/* back-propagation for obtaining gradients */
if (labelSmoothingP > 0)
LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
/* make paddings for the output */
if(output.GetDim(0) > 1)
PadOutput(&output, &padding);
if (output.GetDim(0) > 1)
PadOutput(&output, &gold, &padding);
/* back-propagation for obtaining gradients */
net.Backward(output, gold, CROSSENTROPY);
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)step + 1, -0.5F - lrbias), ((float)step + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* update the parameters */
Update(model, lr);
/* get probabilities */
float prob = GetProb(&output, &gold, NULL);
loss += -prob;
wordCount += wc;
wordCountTotal += wc;
DTYPE lossLocal = -prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
if (doUpdate) {
/* rescale the output for normalized loss */
RescaleOutput(&output, &g, &padding);
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/*for(int i = 0; i < net.nodes.count; i++){
XTensor * node = (XTensor*)net.nodes.Get(i);
XLink::ShowNode(stderr, node);
}
exit(0);*/
gradStep += 1;
loss += -prob;
wordCount += wc;
wordCountTotal += wc;
/* update the parameters */
if(gradStep == updateStep){
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * (float)MIN(pow((float)validStep + 1, -0.5F - lrbias), ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* model update */
Update(model, lr);
gradStep = 0;
validStep++;
}
}
else
nSkipped++;
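/* note: a batch whose per-word loss is NaN, INF or >= 1e3 is skipped
   without an update; otherwise gradients accumulate over updateStep
   batches before one parameter update, and validStep (the number of
   updates, not of batches) drives the learning-rate schedule */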
if(++step >= nstep){
isEnd = true;
......@@ -178,33 +257,39 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f, sppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount), exp(-prob/wc));
XPRINT8(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
}
if(mem != NULL && mem->mode == UNI_FREE)
mem->BackToPin();
if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){
MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0;
nCheckpoint++;
}
}
fclose(file);
if (isEnd)
break;
if(useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
if(mem != NULL && mem->mode == UNI_FREE)
mem->BackToPin();
double elapsed = GetClockSec() - startT;
fclose(tf);
epoch = MIN(epoch, nepoch);
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch);
delete[] trainFN;
}
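/* a minimal sketch (hypothetical helper, not part of this commit) of the
   warm-up learning-rate schedule used in the training loop above; with
   lrbias = 0 it reduces to the standard Transformer schedule
   lr = lrate * d^-0.5 * min(step^-0.5, step * nwarmup^-1.5) */
#include <math.h>
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
float WarmupLR(float lrate, int d, int step, int nwarmup, float lrbias)
{
    float s = (float)step + 1;
    return lrate * (1.0F / (float)sqrt((float)d)) *
           (float)MIN(pow(s, -0.5F - lrbias), s * pow((float)nwarmup, -1.5F - lrbias));
}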
/*
......@@ -218,6 +303,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
int wc = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
float loss = 0;
/* data files */
......@@ -230,16 +316,10 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
XMem * mem = model->mem;
XNet net;
tf = fopen("tmp.xx.txt", "wb");
tc = 0;
double startT = GetClockSec();
wordCount = 0;
if(mem != NULL && mem->mode == UNI_FREE)
mem->BackToPin();
/* batch of input sequences */
XTensor batch;
......@@ -255,7 +335,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
ClearBuf();
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, isLenSorted, wc, devID, mem)){
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......@@ -302,13 +382,8 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
loss += -prob;
wordCount += wc;
wordCountTotal += wc;
if(mem != NULL && mem->mode == UNI_FREE)
mem->BackToPin();
sentCount += 1;
}
if(mem != NULL && mem->mode == UNI_FREE)
mem->BackToPin();
fclose(file);
fclose(ofile);
......@@ -316,21 +391,52 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
delete[] seqs;
double elapsed = GetClockSec() - startT;
fclose(tf);
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, exp(loss / wordCount));
}
/*
make a checkpoint
>> model - the model
>> validFN - validation data file
>> modelFN - model data file
>> label - label of the model
>> id - id of the checkpoint
*/
void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id)
{
char * fn = new char[MAX_LINE_LENGTH];
char * fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
model->Dump(fn);
if(validFN != NULL){
T2TTrainer trainer;
trainer.Init(argNum, argArray);
trainer.Test(validFN, fn2, model);
}
delete[] fn;
delete[] fn2;
}
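/* e.g., with modelFN = "big.model", label = "step" and id = 7 (values
   chosen for illustration), the model is dumped to big.model.step.007
   and the validation output to big.model.step.007.output */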
char line[MAX_SEQUENCE_LENGTH];
struct SampleNode
{
int id;
int * p;
int size;
int value;
};
int CompareSampleNode(const void * a, const void * b)
{
return ((SampleNode*)b)->value - ((SampleNode*)a)->value;
}
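/* the comparator yields a descending order, so the qsort call in
   LoadBuf below puts the longest samples first */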
/*
load data to buffer
>> file - where to load data
......@@ -403,14 +509,46 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
nseqBuf = seqCount;
nextSeq = 0;
/* sort the sequences by length */
if (isSorted) {
SampleNode * nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
for (int i = 0; i < seqCount; i += step) {
nodes[count].id = count;
nodes[count].size = seqLen[i];
SampleNode &node = nodes[count];
node.id = count;
node.p = buf + offset;
node.size = 0;
for(int j = 0; j < step; j++)
node.size += seqLen[i + j];
node.value = seqLen[i];
count++;
offset += node.size;
}
qsort(nodes, seqCount, sizeof(SampleNode), CompareSampleNode);
count = 0;
offset = 0;
for(int i = 0; i < seqCount; i++){
SampleNode &node = nodes[count];
//fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[count + j] = seqLen[node.id + j];
seqOffset[count + j] = offset + (j > 0 ? seqLen[node.id + j - 1] : 0);
}
count += step;
offset += node.size;
}
int * tmp = buf;
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
delete[] nodes;
}
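/* note: samples are sorted in groups of "step" sequences (e.g., a
   source/target pair when step = 2) so that grouped sequences stay
   adjacent, and buf/buf2 and seqLen/seqLen2 are swapped so the sorted
   copies become current */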
......@@ -449,7 +587,7 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
int devID, XMem * mem)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted);
LoadBuf(file, isSorted, step);
int seq = MAX(nextSeq, 0);
int wc = 0;
......@@ -457,7 +595,9 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
int sc = 0;
int max = 0;
while(seq + sc < nseqBuf){
wn = seqLen[seq + sc];
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
CheckNTErrors(len > 0, "Empty sequence!");
wn = len;
wc += wn;
sc += 1;
......@@ -512,13 +652,19 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
for(int w = 0; w < seqLen[s]; w++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == seqLen[s] - 1)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
......@@ -530,7 +676,7 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
}
if(seqs != NULL){
for(int w = seqLen[s]; w < max; w++)
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
......@@ -540,6 +686,23 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
return sc;
}
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TTrainer::Shuffle(const char * srcFile, const char * tgtFile)
{
char * line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
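/* note: shuffling shells out to the POSIX "shuf" utility, which is why
   Shuffle() raises an error on Windows builds */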
/*
get word probabilities for a batch of sequences
......@@ -610,12 +773,12 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
/* m = beat_1 * m + (1-beta_1) * grad */
/* m = beta_1 * m + (1-beta_1) * grad */
XTensor * m = (XTensor*)moments.Get(i);
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, paraGrad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad */
/* v = beta_2 * v + (1-beta_2) * grad * grad*/
XTensor * v = (XTensor*)moments2nd.Get(i);
_Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
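/* the two calls above realize v = beta_2 * v + (1 - beta_2) * grad^2 in
   place: first v <- grad^2 + (beta_2 / (1 - beta_2)) * v, then the sum is
   scaled by (1 - beta_2); this assumes _Multiply(a, b, c, alpha) computes
   c = a * b + alpha * c elementwise */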
......@@ -676,9 +839,11 @@ void T2TTrainer::PrepareModel(T2TModel * model)
/*
do padding on the output
>> output - output tensor of the network
>> gold - gold standard
>> padding - padding of a batch of sentences
*/
void T2TTrainer::PadOutput(XTensor * output, XTensor * padding)
void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
{
if(output == NULL || padding == NULL)
return;
......@@ -693,14 +858,68 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * padding)
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
_CopyValues(padding, padding2);
_MultiplyDim(output, padding2, output, 0);
_ScaleAndShiftMe(padding2, 1e9F, -1e9F);
_SumDim(output, padding2, output, 0);
output->Reshape(on, dimso);
if(gold != NULL){
gold->Reshape(gold->unitNum/dimso[gold->order - 1], dimso[gold->order - 1]);
_CopyValues(padding, padding2);
_MultiplyDim(gold, padding2, gold, 0);
gold->Reshape(on, dimso);
}
delete[] dimso;
DelTensorBuf(padding2);
}
/*
rescale the output and gold tensors for normalized loss
>> output - output tensor of the network
>> gold - gold standard
>> padding - padding of a batch of sentences
*/
void T2TTrainer::RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding)
{
CheckNTErrors(output->order == 3, "Wrong dimension number!");
CheckNTErrors(gold->order == 3, "Wrong dimension number!");
int num = padding->GetDim(0);
XTensor * factor = NewTensorBuf(1, &num, padding->dataType, 1.0F, padding->devID, padding->mem);
_ReduceSum(padding, factor, padding->order - 1);
_ExpMe(output);
_DivDim(output, factor, output, 0);
_LogMe(output);
_DivDim(gold, factor, gold, 0);
DelTensorBuf(factor);
}
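/* in effect each log-probability x becomes log(exp(x) / len) = x - log(len)
   and each gold entry is divided by len, where len is the per-sentence sum
   of the padding mask, so each sentence's contribution to the loss is
   normalized by its length (a reading of the code above) */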
/*
perform label smoothing
>> gold - gold standard
>> smoothed - result of label smoothing
>> p - smoothing factor
*/
void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
{
CheckNTErrors(p >= 0 && p <= 1.0F, "Smoothing factor must be in range [0,1]");
int n = gold->GetDim(-1);
DTYPE q = 1.0F - p;
DTYPE gift = p / n;
InitTensor(smoothed, gold);
_CopyValues(gold, smoothed);
if(p == 0)
return;
_ScaleAndShiftMe(smoothed, q, gift);
}
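/* e.g., with a one-hot gold row (0, 1, 0, 0) and p = 0.1 (so n = 4,
   q = 0.9, gift = 0.025), the smoothed row is (0.025, 0.925, 0.025, 0.025),
   which still sums to 1 */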
}
......@@ -37,15 +37,27 @@ namespace transformer
class T2TTrainer
{
public:
/* number of arguments */
int argNum;
/* argument array */
char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer (used when sorting sequences) */
int * buf2;
/* buffer size */
int bufSize;
/* length of each sequence */
int * seqLen;
/* another length array (used when sorting sequences) */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
......@@ -101,6 +113,24 @@ public:
/* list of the 2nd order moment of the parameter matrices */
XList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches we accumulate before each model update */
int updateStep;
/* indicates whether we double the </s> symbol for the output of LMs */
bool isDoubledEnd;
public:
/* constructor */
T2TTrainer();
......@@ -109,14 +139,17 @@ public:
~T2TTrainer();
/* initialize the trainer */
void Init(int argc, const char ** argv);
void Init(int argc, char ** argv);
/* train the model */
void Train(const char * fn, T2TModel * model);
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
......@@ -130,6 +163,9 @@ public:
int step, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
......@@ -141,7 +177,13 @@ public:
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * padding);
void PadOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* rescale the output and gold tensors for normalized loss */
void RescaleOutput(XTensor * output, XTensor * gold, XTensor * padding);
/* perform label smoothing */
void LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p);
};
......
......@@ -30,7 +30,7 @@ FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -47,7 +47,7 @@ void LoadParamString(int argc, const char ** argv, const char * name, char * p,
strcpy(p, defaultP);
}
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP)
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -64,7 +64,7 @@ void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int
*p = defaultP;
}
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP)
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -81,7 +81,7 @@ void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bo
*p = defaultP;
}
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP)
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
......@@ -98,7 +98,7 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
*p = defaultP;
}
void ShowParams(int argc, const char ** argv)
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
......
......@@ -30,13 +30,13 @@ namespace transformer
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP);
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, const char ** argv);
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
......@@ -32,31 +33,39 @@ int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
ShowParams(argc, args);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, argv, "train", trainFN, "");
LoadParamString(argc, argv, "model", modelFN, "");
LoadParamString(argc, argv, "test", testFN, "");
LoadParamString(argc, argv, "output", outputFN, "");
LoadParamString(argc, args, "train", trainFN, "");
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, argv);
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, argv);
model.InitModel(argc, args);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, &model);
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
......@@ -66,18 +75,25 @@ int TransformerMain(int argc, const char ** argv)
if(strcmp(modelFN, ""))
model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
trainer.Test(testFN, outputFN, &model);
tester.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
for(int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
fclose(tmpFILE);
return 0;
}
}
\ No newline at end of file
}
......@@ -55,6 +55,9 @@ namespace nts {
#define DTYPE_MIN (DTYPE)-3.40E+38
#endif
#define LOGPROB_MIN (DTYPE)-2E+1
#define GRAD_MAX (DTYPE)1E+5
#if WIN32
#define DELIMITER '\\'
#else
......@@ -148,6 +151,7 @@ extern bool useCUDA;
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
......
......@@ -263,6 +263,18 @@ int XLink::GetParamInt(int i)
char * p = (char*)params + i * paramSize;
return *(int*)p;
}
/*
get a parameter as a pointer
>> i - id of the parameter
<< return - the parameter as a pointer
*/
void * XLink::GetParamPointer(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(int **)p;
}
/*
get a parameter in MATRIX_TRANS_TYPE
......@@ -401,8 +413,7 @@ add a boolean parameter
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(bool));
}
......@@ -413,8 +424,7 @@ add a pointer parameter
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h != NULL)
return;
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(param));
}
......@@ -589,9 +599,24 @@ show the network encoded in a root node (tensor)
*/
void XLink::ShowNetwork(FILE * file, XTensor * root)
{
fprintf(file, "node %d - ", root->id);
XLink &income = root->income;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
/*
show a node
>> file - file to dump information
>> node - pointer to the node
*/
void XLink::ShowNode(FILE * file, XTensor * node)
{
fprintf(file, "node %d - ", node->id);
XLink &income = node->income;
if(income.head == NULL){
fprintf(file, "income[%d]: null ", income.tailNum);
}
......@@ -607,7 +632,7 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, ", ");
XLink &outgo = root->outgo;
XLink &outgo = node->outgo;
if(outgo.head == NULL || outgo.tailNum == 0){
fprintf(file, "outgo[%d]: null ", outgo.tailNum);
}
......@@ -623,11 +648,6 @@ void XLink::ShowNetwork(FILE * file, XTensor * root)
}
fprintf(stderr, "\n");
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
ShowNetwork(file, child);
}
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -127,6 +127,9 @@ struct XLink
/* get a parameter in integer */
int GetParamInt(int i);
/* get a parameter as a pointer */
void * GetParamPointer(int i);
/* get a parameter in MATRIX_TRANS_TYPE */
MATRIX_TRANS_TYPE GetParamTrans(int i);
......@@ -178,6 +181,10 @@ struct XLink
/* show the network encoded in a root node (tensor) */
static
void ShowNetwork(FILE * file, XTensor * root);
/* show a node */
static
void ShowNode(FILE * file, XTensor * node);
};
} // namespace nts(NiuTrans.Tensor)
......
......@@ -600,7 +600,7 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
void * result = NULL;
/* search for the memory piece available for the allocation */
for(int i = indexEntryNum; i > index; i--){
for(int i = index; i <= indexEntryNum; i++){
if(i == indexEntryNum){
entry = memIndex + index;
CheckNTErrors(mySize >= minSizeIndex[index], "Wrong index!");
......@@ -667,7 +667,7 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
hit->size = mySize;
hit->head.state = 2;
hit->pReal = beg;
blocks[hit->head.blockID].used += mySize;
blocks[hit->head.blockID].used += head->size;
RemoveFreeIndexNode(hit);
AddAllocIndexNode(hit);
......@@ -690,7 +690,7 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
continue;
if (block->mem == NULL) {
block->size = MAX(maxBlockSize, mySize + 2 * MY_PITCH);
block->size = MAX(block->sizeDesired, mySize + 2 * MY_PITCH);
if (myDevID < 0) {
block->mem = new char[block->size];
memset(block->mem, 0, block->size);
......@@ -719,8 +719,9 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
newNode->head.indexNode = newNode;
newNode->p = block->mem;
newNode->pReal = NULL;
newNode->size = (char*)block->mem + mySize -
(char*)GetPitchedAddress(block->mem, MY_PITCH);
//newNode->size = (char*)block->mem + block->size -
// (char*)GetPitchedAddress(block->mem, MY_PITCH);
newNode->size = mySize;
AddFreeIndexNode(newNode);
......@@ -1041,9 +1042,14 @@ void XMem::RebuildIndex()
/* make a new index node */
MPieceNode * newNode = memIndex2 + nodeNumUsed2++;
newNode->p = p;
newNode->size = node->size;
//newNode->size = (char*)p + head->size -
// ( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
if(head->state == 1){
newNode->size = (char*)p + head->size -
( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
}
else
newNode->size = node->size;
newNode->pre = NULL;
newNode->next = NULL;
......
......@@ -35,6 +35,8 @@ const char * GetOPName(int type)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_ISZERO)
return "M_ISZERO";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
......@@ -107,10 +109,14 @@ const char * GetOPName(int type)
return "S_MERGE_LIST";
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if (type == SHAPE_RESHAPE)
return "S_RESHAPE";
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_SQUEEZE)
return "S_SQUEEZE";
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if (type == SHAPE_UNSQUEEZE)
......
......@@ -35,7 +35,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_LOG MATH_FLOOR + 1
#define MATH_ISZERO MATH_FLOOR + 1
#define MATH_LOG MATH_ISZERO + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
......@@ -81,9 +82,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
#define SHAPE_PERMUTE SHAPE_MERGE_LIST + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_RESHAPE SHAPE_PERMUTE + 1
#define SHAPE_SPLIT SHAPE_RESHAPE + 1
#define SHAPE_SPLIT_LIST SHAPE_SPLIT + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_SQUEEZE SHAPE_SPLIT_LIST + 1
#define SHAPE_TRANSPOSE SHAPE_SQUEEZE + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
......
......@@ -38,6 +38,7 @@
#include "XMem.h"
#include "XHeap.h"
#include "XBLAS.h"
#include "XName.h"
#include "core/shape/MergeBlockLists.h"
#include "core/movement/CopyValues.h"
#include "core/arithmetic/Sum.h"
......@@ -45,6 +46,7 @@
#include "core/arithmetic/Sub.h"
#include "core/arithmetic/Div.h"
#include "core/math/ScaleAndShift.h"
#include "function/Identity.h"
#ifdef USE_CUDA
......@@ -202,7 +204,7 @@ XTensor::~XTensor()
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMP();
newTensor->SetTMPFlag();
newTensor->data = data;
data = NULL;
......@@ -244,6 +246,7 @@ void XTensor::Init()
isInit = false;
isTmp = false;
isGrad = false;
isVar = false;
visitMark = 0;
grad = NULL;
}
......@@ -289,7 +292,8 @@ void XTensor::ShallowCopy(const XTensor &tensor)
/* overloading of the equal-sign */
XTensor& XTensor::operator= (const XTensor& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
......@@ -297,7 +301,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMP();
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
......@@ -311,38 +315,54 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dataHost = NULL;
}
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
{
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
if(false && !tensor.isTmp){
/* NOTE: this might lead to additional data copy on Mac machines */
/* we make an identity transformation here */
if(outgo.tailNum > 0)
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
if(!IsSameShaped(this, &tensor))
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_Identity(&tensor, this);
XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY);
}
else{
DestroyData();
if(!isInit){
devID = tensor.devID;
mem = tensor.mem;
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
{
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else{
DestroyData();
if(!isInit){
devID = tensor.devID;
mem = tensor.mem;
}
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_CopyValues(&tensor, this);
}
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_CopyValues(&tensor, this);
}
/* copy member variables */
ShallowCopy(tensor);
isInit = true;
isTmp = false;
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
/* create tensor links for the new tensor */
XLink::Replace(&tensor, this);
}
return *this;
}
......@@ -353,24 +373,48 @@ XTensor XTensor::operator+ (const XTensor& tensor)
return Sum(*this, tensor);
}
/* overloading of the plus-sign */
XTensor XTensor::operator+ (const DTYPE shift)
{
return ScaleAndShift(*this, 1, shift);
}
/* overloading of the multiply-sign */
XTensor XTensor::operator* (const XTensor& tensor)
{
return Multiply(*this, tensor);
}
/* overloading of the multiply-sign */
XTensor XTensor::operator* (const DTYPE scale)
{
return ScaleAndShift(*this, scale, 0);
}
/* overloading of the minus-sign */
XTensor XTensor::operator- (const XTensor& tensor)
{
return Sub(*this, tensor);
}
/* overloading of the minus-sign */
XTensor XTensor::operator- (const DTYPE shift)
{
return ScaleAndShift(*this, 1, -shift);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor)
{
return Div(*this, tensor);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale)
{
return ScaleAndShift(*this, (DTYPE)1/scale, 0);
}
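/* usage sketch (illustrative, not code from this commit):
   XTensor y = x * 2.0F;   // ScaleAndShift(x, 2, 0)
   XTensor z = y - 1.0F;   // ScaleAndShift(y, 1, -1)
   so scalar and tensor operands compose in expressions such as
   (x - meanFilled) / standardFilled in T2TLN::Make */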
/*
linear transformation b = a * \scale + \shift
>> scale - the slope
......@@ -419,7 +463,7 @@ judge whether the three matrices are in the same type and size
>> c - a tensor again
<< return - whether the two input tensors are identical
*/
bool XTensor::IsSameShaped(XTensor * a, XTensor * b, XTensor * c)
bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c)
{
return IsSameShaped(a, b) && IsSameShaped(a, c);
}
......@@ -440,7 +484,7 @@ void XTensor::SetDim(int * myDimSize)
get the size of a given dimension
>> dim - the given dim we are looking at
*/
int XTensor::GetDim(const int dim)
int XTensor::GetDim(const int dim) const
{
CheckNTErrors(dim < order, "dimension is out of range!");
......@@ -746,6 +790,20 @@ void XTensor::SetDataPointer()
dataP = &data;
}
/* compare two numbers with absolute and relative tolerances */
bool IsFloatEqual(DTYPE a, DTYPE b, float absError, float relError)
{
if(a == b)
return true;
if(fabs(a - b) < absError)
return true;
if(fabs(a) < fabs(b))
return (fabs(a - b) / fabs(b) < relError) ? true : false;
else
return (fabs(a - b) / fabs(a) < relError) ? true : false;
}
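/* e.g., IsFloatEqual(1000.0F, 1000.05F, 1e-4F, 1e-4F) returns true: the
   absolute gap (0.05) fails absError but the relative gap (about 5e-5)
   passes relError */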
/* check whether the data array is the same as the answer */
bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
{
if (data == NULL || d == NULL)
......@@ -759,7 +817,7 @@ bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
DTYPE * answerPrt = (DTYPE*)d;
for (int i = beg; i < num; i++) {
value = ToCPU(devID, valuePrt);
if (fabs(value - *answerPrt) > tolerance)
if(IsFloatEqual(value, *answerPrt, tolerance, 1e-4F) == false)
return false;
valuePrt++;
answerPrt++;
......@@ -1125,7 +1183,7 @@ int XTensor::GetNonzeroSize()
set the tensor as "temporary"
>> myIsTMP - the flag
*/
void XTensor::SetTMP(bool myIsTmp)
void XTensor::SetTMPFlag(bool myIsTmp)
{
isTmp = myIsTmp;
}
......@@ -1134,12 +1192,23 @@ void XTensor::SetTMP(bool myIsTmp)
set the tensor as "keep-gradient"
>> myIsGrad - the flag
*/
void XTensor::SetGrad(bool myIsGrad)
void XTensor::SetGradFlag(bool myIsGrad)
{
isGrad = myIsGrad;
}
/*
set the tensor as "variable"
>> myIsVar - the flag
*/
void XTensor::SetVarFlag(bool myIsVar)
{
isVar = myIsVar;
if(isVar)
SetGradFlag(true);
}
/*
resize a tensor with a specified tensor size
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
......@@ -1415,9 +1484,18 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
else {
ShowNTErrors("TODO!");
else if(dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
int f = ((int*)d)[i];
if(i == beg)
fprintf(file, "%d", f);
else
fprintf(file, " %d", f);
}
}
else
ShowNTErrors("TODO!");
}
else {
int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
......
......@@ -145,6 +145,9 @@ public:
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* indicates whether the tensor is used as parameters (or variables) */
bool isVar;
/* mark for traversing the graph */
unsigned int visitMark;
......@@ -201,15 +204,27 @@ public:
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor);
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift);
/* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor);
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale);
/* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor);
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift);
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor);
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale);
/* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0);
......@@ -220,13 +235,13 @@ public:
/* judge whether the three matrices are in the same type and size */
static
bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* set the size of each dimension */
void SetDim(int * myDimSize);
/* get the size of a given dimension */
int GetDim(const int dim);
int GetDim(const int dim) const;
/* reshape the tensor */
void Reshape(const int order, const int * myDimSize);
......@@ -319,10 +334,13 @@ public:
int GetNonzeroSize();
/* set the tensor as "temporary" */
void SetTMP(bool myIsTmp = true);
void SetTMPFlag(bool myIsTmp = true);
/* set the tensor as "keep-gradient" */
void SetGrad(bool myIsGrad = true);
void SetGradFlag(bool myIsGrad = true);
/* set the tensor as "variable" */
void SetVarFlag(bool myIsVar = true);
/* resize a matrix with a specified matrix size */
bool Resize(const int myOrder, const int * myDimSize,
......
......@@ -63,11 +63,14 @@
#include "movement/CopyIndexed.h"
#include "movement/CopyInGrid.h"
#include "movement/CopyValues.h"
#include "movement/Gather.h"
#include "movement/Spread.h"
#include "reduce/ReduceMax.h"
#include "reduce/ReduceMean.h"
#include "reduce/ReduceStandardVariance.h"
#include "reduce/ReduceSum.h"
#include "reduce/ReduceSumAll.h"
#include "reduce/ReduceSumSquared.h"
#include "reduce/ReduceVariance.h"
......@@ -77,8 +80,10 @@
#include "shape/MakeSplitBlockIndex.h"
#include "shape/Merge.h"
#include "shape/MergeBlockLists.h"
#include "shape/Reshape.h"
#include "shape/Permute.h"
#include "shape/Split.h"
#include "shape/Squeeze.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
......
......@@ -147,6 +147,8 @@ int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
......@@ -181,7 +183,7 @@ where i is the index of the item
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
int n = GetDivDimIndex(a, b);
......
......@@ -150,7 +150,7 @@ i.e., a is divided with b by broadcasting
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
......
......@@ -249,7 +249,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -299,7 +299,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -314,7 +314,7 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/*call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
......@@ -370,7 +370,7 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/*call _MatrixMulBatched function */
_MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
......
......@@ -148,6 +148,8 @@ int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
......@@ -182,7 +184,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
int n = GetMultiplyDimIndex(a, b);
......
......@@ -148,7 +148,7 @@ i.e., a is multiplied with b by broadcasting
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, alpha);
......
......@@ -68,7 +68,7 @@ make a new tensor to keep the result and return it
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
b.SetTMPFlag();
/* call _Negate function */
_Negate(&a, &b);
......
......@@ -74,7 +74,7 @@ make a new tensor to keep the result and return it
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
b.SetTMPFlag();
/* call _Sign function */
_Sign(&a, &b);
......
......@@ -134,6 +134,8 @@ int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
......@@ -164,7 +166,7 @@ make a new tensor c to keep the result and return it
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
int n = GetSubDimIndex(a, b);
......
......@@ -150,7 +150,7 @@ i.e., a is subtracted with b by broadcasting
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
......
......@@ -139,6 +139,8 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
......@@ -169,7 +171,7 @@ make a new tensor c to keep the result and return it
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
int n = GetSumDimIndex(a, b);
......
......@@ -150,7 +150,7 @@ i.e., a is summed with b by broadcasting
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
c.SetTMPFlag();
/* call _Sum function */
_SumDim(&a, &b, &c, n, beta);
......
......@@ -111,7 +111,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
c.SetTMPFlag();
/* call _SelectRange function */
_SelectRange(&a, &c, dim, low, high);
......
......@@ -234,7 +234,7 @@ void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim > 0, "Illegal dimension!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
......@@ -264,11 +264,78 @@ void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
}
/*
modify data items along with a given index and dimension (and keep the remaining items unchanged)
>> source - the tensor whose data array would be modified
>> modify - the tensor whose data array would be used to modify the source tensor
>> dim - the dimension along which we modify the tensor
>> index - index of the given dimension
e.g., given a source tensor (3, 3)
1 2 3
4 5 6
7 8 9
given a modified tensor (3)
1 2 3
when dim = 0, index = 1, we have
1 2 3
1 2 3
7 8 9
i.e., we set entries of row 1 to {1, 2, 3}
*/
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
{
int order = source->order;
int size = source->GetDim(dim);
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
CheckNTErrors(index >= 0 && index < size, "Illegal index!");
for(int i = 0; i < order - 1; i++){
if(i < dim){
CheckNTErrors(modify->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else if(i >= dim){
CheckNTErrors(modify->GetDim(i) == source->GetDim(i+1), "Illegal dimension!");
}
}
if(source->devID < 0 && modify->devID < 0){
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = order - 1; i > dim; i--){
stride *= source->GetDim(i);
}
blockSize = stride * source->GetDim(dim);
blockNum = source->unitNum / blockSize;
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)source->data + blockSize * i + index * stride;
DTYPE * p = (DTYPE*)modify->data + stride * i;
for(int j = 0; j < stride; j++)
d[j] = p[j];
}
}
else if(source->devID >= 0 && modify->devID >= 0) {
#ifdef USE_CUDA
_CudaSetDataIndexed(source, modify, dim, index);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
ShowNTErrors("TODO!");
}
}
/*
generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3* 3 tensor,
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
......@@ -363,7 +430,6 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
}
/*
generate data items with a normal distribution with specified mean and standard deviation
>> mean - mean or expectation of the distribution
......
......@@ -231,7 +231,7 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim > 0, "Illegal dimension!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len < tensor->GetDim(dim), "Illegal length!");
......@@ -255,12 +255,95 @@ void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, len * stride, blockSize, blockNum, p);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride,
len * stride, blockSize, blockNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
modify data items along with a given index and dimension
(and keep the remaining items unchanged) - kernel version
>> s - the pointer whose data would be modified
>> m - the pointer whose data would be used to modify the data pointed by s
>> blockNum - number of data blocks
>> blockSize - size of a data block
>> stride - stride of a data block
*/
__global__
void KernelSetDataIndexed(DTYPE * s, DTYPE * m, int blockNum, int blockSize, int stride)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block id */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= stride || j >= blockNum)
return;
int x = blockSize * j + i;
int y = stride * j + i;
s[x] = m[y];
}
/*
modify data items along with a given index and dimension (and keep the remaining items unchanged)
>> source - the tensor whose data array would be modified
>> modify - the tensor whose data array would be used to modify the source tensor
>> dim - the dimension along which we modify the tensor
>> index - index of the given dimension
e.g., given a source tensor (3, 3)
1 2 3
4 5 6
7 8 9
given a modified tensor (3)
1 2 3
when dim = 0, index = 1, we have
1 2 3
1 2 3
7 8 9
i.e., we set entries of row 1 to {1, 2, 3}
*/
void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
{
int order = source->order;
int size = source->GetDim(dim);
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
CheckNTErrors(index >= 0 && index < size, "Illegal index!");
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = order - 1; i > dim; i--){
stride *= source->GetDim(i);
}
blockSize = stride * source->GetDim(dim);
blockNum = source->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(source->devID, stride, blockNum, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup);
KernelSetDataIndexed<<<blocks, threads >>>((DTYPE*)source->data + index * stride, (DTYPE*)modify->data,
blockNum, blockSize, stride);
BacktoCudaDev(source->devID, devIDBackup);
}
/*
set lower triangular matrices for each block
>> d - pointer to the data array
>> l - row number (or column number) of each block, i.e.,
......
......@@ -40,6 +40,9 @@ void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions (cuda version) */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
......
......@@ -48,6 +48,9 @@ void _SetDataFixedDouble(XTensor * tensor, double p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* modify data items along with a given index and dimension (and keep the remaining items unchanged) */
void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
/* generate data as lower triangular matrices for last two dimensions */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
......
......@@ -81,7 +81,7 @@ make a new tensor to keep the result and return it
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
{
XTensor b(&a);
b.SetTMP();
b.SetTMPFlag();
/* call _Clip function */
_Clip(&a, &b, lower, upper);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
*/
#ifndef __CLIP_H__
#define __CLIP_H__
......@@ -29,16 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its clip value */
void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
/*
set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its clip value (do it on site)
keep the result in the input tensor a and return nothing */
void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
/*
set every entry to its clip value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its clip value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
/*
......
......@@ -132,7 +132,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMP();
output.SetTMPFlag();
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
......
......@@ -90,7 +90,7 @@ make a new tensor to keep the result and return it
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMP();
b.SetTMPFlag();
/* call _Power function */
_Power(&a, &b, p);
......
......@@ -105,7 +105,7 @@ b = a * scale + shift
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{
XTensor b(&a);
b.SetTMP();
b.SetTMPFlag();
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XName.h"
......@@ -36,6 +37,11 @@ DTYPE round(DTYPE r)
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
DTYPE iszero(DTYPE r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
}
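A quick sanity check of the new helper (an illustrative sketch, not from the commit; assumes <assert.h> is available):

    assert(iszero((DTYPE)0.0) == (DTYPE)1.0);   /* zero maps to one */
    assert(iszero((DTYPE)0.5) == (DTYPE)0.0);   /* non-zero maps to zero */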
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
......@@ -65,7 +71,7 @@ void _funcNameMe(XTensor * a) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMP(); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
......@@ -87,6 +93,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
......@@ -140,7 +150,7 @@ void _funcNameMe(XTensor * a) \
XTensor funcName(const XTensor &a) \
{ \
XTensor b(&a); \
b.SetTMP(); \
b.SetTMPFlag(); \
_funcName(&a, &b); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
......@@ -163,6 +173,10 @@ _SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
......@@ -28,17 +29,23 @@
namespace nts {
__device__
DTYPE CudaSquare(DTYPE x)
DTYPE cudasquare(DTYPE x)
{
return x * x;
}
__device__
DTYPE CudaRound(DTYPE r)
DTYPE cudaround(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
__device__
DTYPE cudaiszero(DTYPE r)
{
return (r == 0.0) ? (DTYPE)1.0 : (DTYPE)0.0;
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -89,10 +96,11 @@ SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(IsZero, cudaiszero)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, CudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Round, cudaround)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, CudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Square, cudasquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#ifndef __UNARY_CUH__
#define __UNARY_CUH__
......@@ -65,6 +66,15 @@ void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) */
__global__
void KernelIsZero(DTYPE * a, DTYPE * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero (CUDA Kernel) with float16 data type */
__global__
void KernelIsZero(__half * a, __half * b, int size);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _CudaIsZero(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * a, DTYPE * b, int size);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#ifndef __UNARY_H__
#define __UNARY_H__
......@@ -62,6 +63,15 @@ void _FloorMe(XTensor * a);
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b);
/* if source entry is zero, set target entry to be one, otherwise zero (do it on site)
keep the result in the input tensor a and return nothing */
void _IsZeroMe(XTensor *a);
/* if source entry is zero, set target entry to be one, otherwise zero (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsZero(const XTensor &a);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
/* set every entry to its logarithm value (do it on site)
......
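For illustration, a hedged sketch of the three new IsZero entry points (not from the commit; a and b are assumed to be pre-allocated tensors of the same shape):

    _SetDataRand(a, -1.0F, 1.0F);   /* fill a with random values */
    _IsZero(a, b);                  /* b[i] = 1.0 where a[i] == 0.0, else 0.0 */
    _IsZeroMe(a);                   /* in-place variant: overwrites a itself */
    XTensor c = IsZero(*a);         /* returns a new tensor and records the op via XLink */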
......@@ -32,7 +32,7 @@ copy indexed sub-tensors
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
......@@ -130,22 +130,30 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
t.SetTMPFlag();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* note: we must allocate new arrays to save the indices,
because the source index arrays may be freed. */
int * saveSrcIndex = new int[indexSize];
memcpy(saveSrcIndex, srcIndex, indexSize * sizeof(int));
int * saveTgtIndex = new int[indexSize];
memcpy(saveTgtIndex, tgtIndex, indexSize * sizeof(int));
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
XLink::AddParamToHeadPointer(&t, srcIndex);
XLink::AddParamToHeadPointer(&t, saveSrcIndex);
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadPointer(&t, saveTgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
......
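To illustrate the fix above (a sketch, not from the commit; s is an assumed (3, 2, 4) tensor): the caller may now release its index arrays right after the call, because CopyIndexed stores private copies in the XLink:

    int srcIndex[2] = {0, 2};
    int tgtIndex[2] = {0, 1};
    /* copy sub-tensors 0 and 2 along dim 2 into positions 0 and 1 */
    XTensor t = CopyIndexed(s, 2, srcIndex, 2, tgtIndex, 1);
    /* srcIndex and tgtIndex may go out of scope here; backward still works */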
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __COPYINDEXED_H__
#define __COPYINDEXED_H__
......
......@@ -108,7 +108,7 @@ make a new tensor to keep the result and return it
XTensor CopyValues(const XTensor &s, XStream * stream)
{
XTensor t(&s);
t.SetTMP();
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-18
*/
#include "Gather.h"
#include "CopyIndexed.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex
*/
void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex
<< return - the gathered sub-tensors
Note: the index must be on the CPU!
*/
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
/* call CopyIndexed function */
XTensor result;
result = CopyIndexed(s, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
return result;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
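For illustration, a minimal usage sketch of the new Gather function (not from the commit; s is an assumed (3, 2, 4) tensor and, as noted above, the index array must live on the CPU):

    int srcIndex[2] = {0, 2};
    /* gather sub-tensors 0 and 2 along dim = 2; t has size (3, 2, 2) */
    XTensor t = Gather(s, 2, srcIndex, 2);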
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-18
*/
#ifndef __GATHER_H__
#define __GATHER_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(const XTensor &s, int dim, int * srcIndex, int indexSize);
} // namespace nts(NiuTrans.Tensor)
#endif // __GATHER_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#include "Spread.h"
#include "Spread.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
This is the core assignment for the spread function.
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
*/
void _Assignment(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride)
{
for (int i = 0; i < blockNum; i++) {
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
for(int j = 0; j < stride; j++)
s[j] = c[j];
}
}
/*
spread a collection tensor to the source tensor.
This is the inverse operation of gather.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
*/
void _Spread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
{
int order = source->order;
int size = source->GetDim(dim);
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for(int i = 0; i < order; i++){
if(i < dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else if(i > dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else{
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
}
}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpread(source, collection, dim, srcIndex, indexSize, collIndex);
return;
}
#endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++){
int src = srcIndex[i];
int tgt = collIndex[i];
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
_Assignment(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
}
}
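A sketch of the inverse direction (illustrative only; source and collection are assumed tensors shaped as required by the checks above):

    int srcIndex[2] = {0, 2};
    int collIndex[2] = {0, 1};
    /* write collection sub-tensors 0 and 1 back to positions 0 and 2
       of source along dim = 2; the inverse of the Gather sketch above */
    _Spread(source, collection, 2, srcIndex, 2, collIndex);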
/*
This is the core assignment for the backward computation of the gather function.
Note the use of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
*/
void _AssignmentForGather(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride)
{
for (int i = 0; i < blockNum; i++) {
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
for(int j = 0; j < stride; j++)
s[j] += c[j];
}
}
/*
spread a collection tensor to the source tensor.
This is a special spread function for the backward computation of the gather function.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
*/
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
{
int order = source->order;
int size = source->GetDim(dim);
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
for(int i = 0; i < order; i++){
if(i < dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else if(i > dim){
CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
}
else{
CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
}
}
#ifdef USE_CUDA
if(source->devID >= 0 && collection->devID >= 0) {
_CudaSpreadForGather(source, collection, dim, srcIndex, indexSize, collIndex);
return;
}
#endif
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++){
int src = srcIndex[i];
int tgt = collIndex[i];
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
_AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
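A sketch of why the accumulation matters (illustrative only; sourceGrad and collectionGrad are assumed, suitably shaped tensors): when srcIndex contains duplicates, the corresponding collection blocks must be summed into the same source block, which is exactly the gradient of gather:

    int srcIndex[2] = {1, 1};
    int collIndex[2] = {0, 1};
    /* both collection blocks are added (+=) into source block 1 */
    _SpreadForGather(sourceGrad, collectionGrad, 2, srcIndex, 2, collIndex);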
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#ifndef __SPREAD_CUH__
#define __SPREAD_CUH__
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "Spread.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
This is the core assignment for the spread function.
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - the number of data blocks
>> blockSizeSrc - the size of source data block
>> blockSizeColl - the size of collection data block
>> stride - the stride of a data block
*/
__global__
void KernelSpread(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum || j >= stride)
return;
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
s[j] = c[j];
}
/*
spread a collection tensor to the source tensor (cuda version).
This is the inverse operation of gather.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
*/
void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
{
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(source->devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = collIndex[i];
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
KernelSpread<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
}
BacktoCudaDev(source->devID, devIDBackup);
}
/*
This is the core assignment for the backward computation of the gather function.
Note the use of the operator "+=" instead of "=".
>> sData - the data pointer of the source tensor
>> cData - the data pointer of collection tensor
>> blockNum - number of data blocks
>> blockSizeSrc - size of source data block
>> blockSizeColl - size of collection data block
>> stride - stride of a data block
*/
__global__
void KernelSpreadForGather(DTYPE * sData, DTYPE * cData, int blockNum,
int blockSizeSrc, int blockSizeColl, int stride)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* offset in each block */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockNum || j >= stride)
return;
DTYPE * s = sData + blockSizeSrc * i;
DTYPE * c = cData + blockSizeColl * i;
s[j] += c[j];
}
/*
spread a collection tensor to the source tensor (cuda version).
This is a special spread function for the backward computation of the gather function.
>> source - the source tensor whose data would be modified
>> collection - the collection whose data would be spread to source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and collIndex)
>> collIndex - index of the gathered sub-tensors
*/
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex)
{
int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
int blockSizeSrc = 1;
int blockSizeColl = 1;
int blockNum = 1;
int stride = 1;
for (int i = dim + 1; i < order; i++) {
stride *= source->GetDim(i);
}
blockSizeSrc = stride * source->GetDim(dim);
blockSizeColl = stride * collection->GetDim(dim);
blockNum = source->unitNum / blockSizeSrc;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(source->devID, blockNum, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(source->devID, devIDBackup);
DTYPE * sData = (DTYPE*)source->data;
DTYPE * cData = (DTYPE*)collection->data;
for(int i = 0; i < indexSize; i++) {
int src = srcIndex[i];
int tgt = collIndex[i];
DTYPE * s = sData + src * stride;
DTYPE * c = cData + tgt * stride;
KernelSpreadForGather<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);
}
BacktoCudaDev(source->devID, devIDBackup);
}
} // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#ifndef __SPREAD_CUH__
#define __SPREAD_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* spread a collection tensor to source tensor (cuda version) */
void _CudaSpread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function (cuda version) */
void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
} // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#ifndef __SPREAD_H__
#define __SPREAD_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* spread a collection tensor to source tensor */
void _Spread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* spread a collection tensor to source tensor (return a XTensor structure)
make a new tensor to keep the result and return it */
void Spread(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
/* special spread function for backward computation of gather function */
void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
int * srcIndex, int indexSize, int * collIndex);
} // namespace nts(NiuTrans.Tensor)
#endif // __SPREAD_H__
\ No newline at end of file
......@@ -114,7 +114,7 @@ XTensor ReduceMax(const XTensor &input, int dim)
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
......
......@@ -71,7 +71,7 @@ XTensor ReduceMean(const XTensor &input, int dim)
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
......
......@@ -225,7 +225,7 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
......@@ -271,7 +271,7 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
......
......@@ -33,7 +33,7 @@ sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
*/
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL,
DTYPE power = (DTYPE)1.0F, bool isExp = false);
DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return a XTensor structure)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#include "ReduceSumAll.h"
#include "ReduceSum.h"
#include "../movement/CopyValues.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
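/* get the dimension sizes of "tensor" with dimension n removed
   (the caller is responsible for releasing the returned array with delete[]) */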
int * getDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
}
return dimSize;
}
/*
sum all the items of the tensor (It should be optimized!)
>> source - the input tensor
<< return - the total summation
*/
DTYPE _ReduceSumAll(XTensor * source)
{
int order = source->order;
DTYPE summation;
XTensor * big = NewTensor(source);
_CopyValues(source, big);
for(int i = 0; i < order; i++) {
if(i == order - 1)
big->Reshape(big->unitNum, 1);
int * dimSize;
dimSize = getDimSize(big, 0);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
_ReduceSum(big, little, 0);
delete big;
delete[] dimSize;
big = NewTensor(little);
_CopyValues(little, big);
delete little;
}
summation = big->Get1D(0);
delete big;
return summation;
}
/*
sum all the items of the tensor
>> source - the input tensor
<< return - the total summation
*/
DTYPE ReduceSumAll(XTensor & source)
{
return _ReduceSumAll(&source);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
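For illustration, how the reduction proceeds on a small input (a sketch, not from the commit; x is an assumed (2, 3) tensor filled with ones):

    /* shapes: (2, 3) --reduce dim 0--> (3) --reshape--> (3, 1) --reduce dim 0--> (1) */
    DTYPE sum = ReduceSumAll(x);   /* returns 6.0 for a (2, 3) tensor of ones */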
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#ifndef __REDUCESUMALL_H__
#define __REDUCESUMALL_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sum all the items of the tensor */
DTYPE _ReduceSumAll(XTensor * source);
/* sum all the items of the tensor */
DTYPE ReduceSumAll(XTensor & source);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUMALL_H__
\ No newline at end of file
......@@ -67,7 +67,7 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
......
......@@ -70,7 +70,7 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
output.SetTMPFlag();
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
......
......@@ -93,7 +93,7 @@ XTensor Concatenate(const XList &smalls, int dim)
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
big.SetTMPFlag();
/* call _Merge function */
_Merge(&smalls, &big, dim);
......@@ -121,7 +121,7 @@ XTensor Concatenate(const XList &smalls, int dim)
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
big.SetTMPFlag();
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
......@@ -194,7 +194,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
big.SetTMPFlag();
/* call _Merge function */
_Merge(&smalls, &big, dim);
......@@ -222,7 +222,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
big.SetTMPFlag();
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
......
......@@ -183,7 +183,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
t.SetTMPFlag();
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
......@@ -334,7 +334,7 @@ XTensor Merge(const XList &smalls, int whereToMerge)
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
big.SetTMPFlag();
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
......@@ -371,7 +371,7 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
float dr = (!smallA.isSparse) ? 1.0F : smallA.denseRatio;
XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
big.SetTMP();
big.SetTMPFlag();
XList smalls(2);
smalls.Add(&smallA);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../movement/CopyValues.h"
#include "Reshape.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
reshape the tensor
>> s - the input tensor
>> order - order of the tensor
>> dimSize - the size of each dimension
<< return - the output tensor
*/
XTensor Reshape(XTensor &s, int order, int * dimSize)
{
XTensor t(&s);
t.SetTMPFlag();
_CopyValues(&s, &t);
int oriOrder = s.order;
int * oriDimSize = new int[oriOrder];
memcpy(oriDimSize, s.dimSize, sizeof(int) * oriOrder);
/* call Reshape function */
t.Reshape(order, dimSize);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
XLink::AddParamToHeadInt(&t, oriOrder);
XLink::AddParamToHeadPointer(&t, oriDimSize);
return t;
}
} // namespace nts(NiuTrans.Tensor)
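A minimal usage sketch of the new Reshape wrapper (illustrative only; s is an assumed (2, 3) tensor, and the total number of items must stay the same):

    int newDims[2] = {3, 2};
    XTensor t = Reshape(s, 2, newDims);   /* view the 6 items as (3, 2) */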
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#ifndef __RESHAPE_H__
#define __RESHAPE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* reshape the tensor */
XTensor Reshape(XTensor &s, int order, int * dimSize);
} // namespace nts(NiuTrans.Tensor)
#endif // __RESHAPE_H__
......@@ -184,7 +184,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
t.SetTMPFlag();
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#include "Squeeze.h"
#include "../movement/CopyValues.h"
#include "../../XName.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
squeeze the tensor along the specified dimension
>> source - the input tensor
>> target - the output tensor
>> leadingDim - the dimension that we would squeeze
if leadingDim = -1, squeeze all dimensions that are 1
else, squeeze the specified dimension
*/
void _Squeeze(XTensor * source, XTensor * target, int leadingDim)
{
int order = target->order;
CheckNTErrors(XTensor::IsSameShaped(source, target),
"The source and target tensor must be of the same size!");
CheckNTErrors(leadingDim >= -1 && leadingDim < order,
"Wrong leading dimension");
_CopyValues(source, target);
if(leadingDim < 0) {
int * newDimSize = new int[order];
int newOrder = 0;
for(int i = 0; i < order; i++) {
int dim = source->GetDim(i);
if(dim > 1) {
newDimSize[newOrder] = dim;
newOrder += 1;
}
}
target->Reshape(newOrder, newDimSize);
delete[] newDimSize;
}
else {
if(source->GetDim(leadingDim) > 1)
return;
int newOrder = order - 1;
int * newDimSize = new int[newOrder];
for(int i = 0; i < order; i++)
if(i < leadingDim)
newDimSize[i] = source->GetDim(i);
else if(i > leadingDim)
newDimSize[i - 1] = source->GetDim(i);
target->Reshape(newOrder, newDimSize);
delete[] newDimSize;
}
}
/*
squeeze the tensor along the specified dimension (do it on site)
keep the result in the input tensor a and return nothing
>> source - the input tensor
>> leadingDim - the dimension that we would squeeze
if leadingDim = -1, squeeze all dimensions that are 1
else, squeeze the specified dimension
*/
void _SqueezeMe(XTensor * source, int leadingDim)
{
_Squeeze(source, source, leadingDim);
}
/*
squeeze the tensor along the specified dimension (return a XTensor structure)
make a new tensor to keep the result and return it
>> source - the input tensor
>> leadingDim - the dimension that we would squeeze
if leadingDim = -1, squeeze all dimensions that are 1
else, squeeze the specified dimension
<< return - the output tensor after squeeze operation
*/
XTensor Squeeze(XTensor & source, int leadingDim)
{
XTensor target(&source);
target.SetTMPFlag();
/* call _Squeeze function */
_Squeeze(&source, &target, leadingDim);
/* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
return target;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
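For illustration (a sketch, not from the commit; s is an assumed (3, 1, 4) tensor):

    XTensor t0 = Squeeze(s);      /* leadingDim = -1: drop every size-1 dim -> (3, 4) */
    XTensor t1 = Squeeze(s, 1);   /* drop only dimension 1 -> (3, 4) */
    XTensor t2 = Squeeze(s, 0);   /* dim 0 has size 3 > 1, so the shape is unchanged */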
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#ifndef __SQUEEZE_H__
#define __SQUEEZE_H__
#include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* squeeze the tensor along the specified dimension */
void _Squeeze(XTensor * source, XTensor * target, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (do it on site)
keep the result in the input tensor a and return nothing */
void _SqueezeMe(XTensor * source, int leadingDim = -1);
/* squeeze the tensor along the specified dimension (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Squeeze(XTensor & source, int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __SQUEEZE_H__
\ No newline at end of file
......@@ -138,7 +138,7 @@ XTensor Transpose(const XTensor &a, const int i, const int j)
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
b.SetTMP();
b.SetTMPFlag();
/* call _Transpose function */
_Transpose(&a, &b, i, j);
......
......@@ -122,7 +122,7 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
b.SetTMP();
b.SetTMPFlag();
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __UNSQUEEZE_H__
#define __UNSQUEEZE_H__
......@@ -26,14 +26,13 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
/* insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) */
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
/*
insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#include <math.h>
#include "CrossEntropy.h"
#include "CrossEntropy.cuh"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/arithmetic/Negate.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceSumAll.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropy(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss), "The loss tensor and padding tensor must be of the same shape!");
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold and weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold and log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold and log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate multiply result n = negate(mul) */
_NegateMe(mulBuf);
_ReduceSum(mulBuf, loss, n);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
}
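A quick numerical check of the formula above (it matches TestCrossEntropy1 later in this commit): for a uniform prediction over four classes, output = (0.25, 0.25, 0.25, 0.25), and gold = (0.5, 0.5, 0, 0),

    loss = -(0.5 * log(0.25) + 0.5 * log(0.25)) = log(4) ≈ 1.3863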
/*
compute the cross entropy loss (manual implementation)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyManual(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyManual(output, gold, loss, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE * lossData = (DTYPE*)loss->data;
DTYPE tmpLoss;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
*(lossData + i) = 0;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
}
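The stride/blockSize/blockNum decomposition above treats the tensor as blockNum contiguous distributions of blockSize elements each. A minimal sketch for the common case where the class dimension is last, using the (4, 10) shape from TestCrossEntropy2 below (illustrative values only):

/* block layout for an output of shape (4, 10) with leadingDim n = 1 */
int dims[2] = {4, 10};
int order = 2;
int n = 1;
int stride = 1;
for (int i = n + 1; i < order; i++)
    stride *= dims[i];                 /* stride = 1 (n is the last dim) */
int blockSize = stride * dims[n];      /* blockSize = 10 classes         */
int blockNum = (4 * 10) / blockSize;   /* blockNum = 4 samples           */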
/*
get the dimSize array after a reduce operation over dimension n
>> tensor - the tensor to be reduced
>> n - the dimension to reduce
<< return - pointer to the new dimSize array (to be freed by the caller)
*/
int * reduceDimSize(const XTensor * tensor, int n)
{
int order = tensor->order;
int * dimSize = new int[order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = tensor->dimSize[i];
else if(i > n)
dimSize[i - 1] = tensor->dimSize[i];
}
return dimSize;
}
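A minimal sketch of the intended use, mirroring the call in the scalar _CrossEntropy below; note that the caller owns the returned array:

/* build the (order - 1)-dimensional loss tensor for a reduce over dimension n */
int * dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType,
                                output->denseRatio, output->devID, output->mem);
/* ... reduce into lossInter ... */
delete lossInter;
delete[] dimSize;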
/*
compute the cross entropy loss (scalar version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - the way to reduce the loss (sum or mean)
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
<< return - the cross entropy loss (a scalar)
*/
DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
CheckNTErrors(n >= 0 && n < output->order, "Wrong leadingDim!");
int unitNum = output->dimSize[n];
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == unitNum, "Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1, "Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * logBuf = NewTensorBuf(output, output->devID, output->mem);
XTensor * mulBuf = NewTensorBuf(output, output->devID, output->mem);
/* l = log(output) */
_Log(output, logBuf);
if(weight != NULL){
XTensor * weightBuf = NewTensorBuf(output, output->devID, output->mem);
/* multiply gold and weight by broadcast wg = mulDim(g * w) */
_MultiplyDim(gold, weight, weightBuf, n, 0);
/* multiply weighted gold and log(output) wgl = mul(wg, l) */
_Multiply(weightBuf, logBuf, mulBuf, 0);
DelTensorBuf(weightBuf);
}
else{
/* multiply gold and log(output) gl = mul(g, l) */
_Multiply(gold, logBuf, mulBuf, 0);
}
/* negate multiply result n = negate(mul) */
_NegateMe(mulBuf);
int * dimSize;
dimSize = reduceDimSize(output, n);
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
/* reduce sum all classes */
_ReduceSum(mulBuf, lossInter, n);
DelTensorBuf(mulBuf);
DelTensorBuf(logBuf);
DTYPE loss;
/* compute the total loss */
if(padding != NULL) {
XTensor * temp = NewTensor(lossInter);
_Multiply(lossInter, padding, temp);
loss = _ReduceSumAll(temp);
delete temp;
}
else
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) {
if(padding != NULL) {
XTensor * zeroIndicator = NewTensorBuf(padding, padding->devID, padding->mem);
_IsZero(padding, zeroIndicator);
int reduceSize = (int)_ReduceSumAll(zeroIndicator);
loss = loss / (DTYPE)(padding->unitNum - reduceSize);
DelTensorBuf(zeroIndicator);
}
else
loss = loss / (DTYPE)lossInter->unitNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
ShowNTErrors("TODO");
}
delete[] dimSize;
delete lossInter;
return loss;
}
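A minimal usage sketch of this scalar interface (tensor shapes as in the tests later in this commit):

/* sum vs. mean reduction over a batch of distributions */
DTYPE sumLoss = _CrossEntropy(output, gold, REDUCE_SUM);
DTYPE meanLoss = _CrossEntropy(output, gold, REDUCE_MEAN);
/* with a padding tensor, REDUCE_MEAN divides by the number of
   non-padded (non-zero) positions instead of the full batch size */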
/*
compute the cross entropy loss (manual implementation)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - the way to reduce the loss (sum or mean)
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
<< return - the cross entropy loss (a scalar)
*/
DTYPE _CrossEntropyManual(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
return _CudaCrossEntropyManual(output, gold, reduceWay, weight, padding, leadingDim);
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE loss = 0;
int nonZeroNum = 0;
if(weight == NULL) {
if(padding == NULL) {
nonZeroNum = blockNum;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
}
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
nonZeroNum = blockNum;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
if(*(paddingData + i) == 0)
continue;
else{
nonZeroNum += 1;
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
loss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
}
}
}
}
if(reduceWay == REDUCE_MEAN) {
loss = loss / (DTYPE)nonZeroNum;
}
else if(reduceWay == REDUCE_SUM) {
/* don't need to do anything */
}
else {
ShowNTErrors("TODO");
}
return loss;
}
/*
backward computation for cross entropy function (tensor version)
loss = sum_{i} (-t_i * log(y_i))
dE/dy_i = -t_i / y_i
where E is the error (loss) function that measures the errors in y
with respect to the gold standard, and y is the model output
>> dedy - dE/dy (for return)
>> output - model prediction
>> gold - gold standard
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
*/
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, XTensor * padding,
int leadingDim)
{
#ifdef USE_CUDA
if(output->devID >= 0) {
_CudaCrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
return;
}
#endif
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
int unitSize = dedy->unitSize;
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
if(weight == NULL) {
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
}
else {
DTYPE * weightData = (DTYPE*)weight->data;
if(padding == NULL) {
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
else {
DTYPE * paddingData = (DTYPE*)padding->data;
for(int i = 0; i < blockNum; i++) {
int beg = i * blockSize;
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * unitSize);
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
}
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsZero(padding, tmp);
int nonZeroNum = padding->unitNum - (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
}
}
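The per-element rule implemented above follows from differentiating one term of the loss:

    d(-t_i * log(y_i)) / dy_i = -t_i / y_i

and the final _ScaleAndShiftMe divides by the number of (non-padded) blocks, so the returned gradient corresponds to a mean-reduced loss.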
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_CU__
#define __CROSSENTROPY_CU__
#include "../XTensor.h"
#include "../XDevice.h"
#include "CrossEntropy.cuh"
#include "CrossEntropy.h"
#include "../core/reduce/ReduceSumAll.h"
#include "../core/math/Unary.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
compute the cross entropy loss (cuda kernel)
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> lossData - the data pointer of loss tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropy(DTYPE * outputData, DTYPE * goldData,
DTYPE * lossData, DTYPE * weightData,
DTYPE * paddingData, int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
DTYPE tmpLoss = 0;
if(weightData == NULL) {
if(paddingData == NULL) {
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j));
*(lossData + i) = tmpLoss;
}
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
else {
if(*(paddingData + i) == 0)
*(lossData + i) = tmpLoss;
else{
tmpLoss = 0;
for(int j = 0; j < blockSize; j++)
tmpLoss += -(*(goldData + beg + j)) *
(DTYPE)log(*(outputData + beg + j)) *
(*(weightData + j));
*(lossData + i) = tmpLoss;
}
}
}
}
/*
compute the cross entropy loss (cuda version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> loss - the computed loss
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || XTensor::IsSameShaped(padding, loss),
"The loss tensor and padding tensor must be same shape!");
CheckNTErrors(loss->order == output->order - 1,
"Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
//GDevs.GetCudaThread2D(output->devID, blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
DTYPE * lossData = (DTYPE*)loss->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropy<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(outputData, goldData, lossData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
BacktoCudaDev(output->devID, devIDBackup);
}
/*
compute the cross entropy loss (scalar version)
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> reduceWay - the way to reduce the loss (sum or mean)
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
<< return - the cross entropy loss (a scalar)
*/
DTYPE _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight,
const XTensor * padding, int leadingDim)
{
DTYPE loss = 0;
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leadingDim!");
CheckNTErrors(XTensor::IsSameShaped(output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int * dimSize = new int[output->order - 1];
for (int i = 0; i < order; i++) {
if(i < n)
dimSize[i] = output->dimSize[i];
else if(i > n)
dimSize[i - 1] = output->dimSize[i];
}
XTensor * lossInter = NewTensor(output->order - 1, dimSize, output->dataType, output->denseRatio, output->devID, output->mem);
_CudaCrossEntropyManual(output, gold, lossInter, weight, padding, leadingDim);
loss = _ReduceSumAll(lossInter);
if(reduceWay == REDUCE_MEAN) {
int totalNum;
if(padding == NULL) {
totalNum = lossInter->unitNum;
}
else {
XTensor * zeroIndicator = NewTensorBuf(padding, padding->devID, padding->mem);
_IsZero(padding, zeroIndicator);
totalNum = lossInter->unitNum - (int)_ReduceSumAll(zeroIndicator);
DelTensorBuf(zeroIndicator);
}
loss = loss / (DTYPE)totalNum;
}
delete lossInter;
delete[] dimSize;
return loss;
}
/*
backward computation of cross entropy function (kernel version)
>> dedyData - the data pointer of dedy tensor
>> outputData - the data pointer of output tensor
>> goldData - the data pointer of gold tensor
>> weightData - the data pointer of weight tensor
>> paddingData - the data pointer of padding tensor
>> blockNum - the number of data blocks
>> blockSize - the size of a data block
*/
__global__
void KernelCrossEntropyBackward(DTYPE * dedyData, DTYPE * outputData, DTYPE * goldData,
DTYPE * weightData, DTYPE * paddingData,
int blockNum, int blockSize)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i >= blockNum)
return;
int beg = i * blockSize;
if(weightData == NULL) {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
else {
if(paddingData == NULL) {
for(int j = 0; j < blockSize; j++)
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
else {
if(*(paddingData + i) == 0)
memset(dedyData + beg, 0, blockSize * sizeof(DTYPE));
else
for(int j = 0; j < blockSize; j++) {
*(dedyData + beg + j) = -(*(weightData + j)) *
(*(goldData + beg + j)) /
(*(outputData + beg + j));
}
}
}
}
/*
backward computation of cross entropy function
loss = sum_{i} (-t_i * log(y_i))
dE/dy_i = -t_i / y_i
where E is the error (loss) function that measures the errors in y
with respect to the gold standard, and y is the model output
>> dedy - dE/dy (for return)
>> output - model prediction
>> gold - gold standard
>> weight - a rescaling weight given to each class
>> padding - specifies target positions that are ignored and do not contribute to the loss
>> leadingDim - the leading dimension for the output
*/
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight, XTensor * padding,
int leadingDim)
{
int order = output->order;
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
int leadingDimSize = output->GetDim(n);
CheckNTErrors(n >= 0 && n < output->order,
"Wrong leading dimension!");
CheckNTErrors(XTensor::IsSameShaped(dedy, output, gold),
"The output tensor and gold tensor must be of the same size!");
CheckNTErrors(weight == NULL || weight->unitNum == leadingDimSize,
"Wrong weight tensor!");
CheckNTErrors(padding == NULL || padding->order == output->order - 1,
"Wrong padding tensor!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE,
"TODO!");
int blockNum = 1;
int blockSize = 1;
int stride = 1;
for(int i = n + 1; i < order; i++)
stride *= output->GetDim(i);
blockSize = stride * leadingDimSize;
blockNum = output->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread(output->devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(output->devID, devIDBackup);
DTYPE * dedyData = (DTYPE*)dedy->data;
DTYPE * outputData = (DTYPE*)output->data;
DTYPE * goldData = (DTYPE*)gold->data;
if(weight == NULL) {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
NULL, (DTYPE*)padding->data,
blockNum, blockSize);
}
else {
if(padding == NULL)
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, NULL,
blockNum, blockSize);
else
KernelCrossEntropyBackward<<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
(dedyData, outputData, goldData,
(DTYPE*)weight->data, (DTYPE*)padding->data,
blockNum, blockSize);
}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsZero(padding, tmp);
int nonZeroNum = padding->unitNum - (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
}
BacktoCudaDev(output->devID, devIDBackup);
}
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_CU__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_CUH__
#define __CROSSENTROPY_CUH__
#include "../XTensor.h"
#include "CrossEntropy.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the cross entropy loss (tensor version) */
void _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (scalar version) */
DTYPE _CudaCrossEntropyManual(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __CROSSENTROPY_H__
#define __CROSSENTROPY_H__
#include "../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
enum LOSS_COMPUTE_WAY{
REDUCE_SUM,
REDUCE_MEAN
};
/* compute the cross entropy loss (tensor version) */
void _CrossEntropy(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (tensor version) */
void _CrossEntropyManual(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (scalar version) */
DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss (scalar version) */
DTYPE _CrossEntropyManual(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay = REDUCE_MEAN, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* backward computation of cross entropy function */
void _CrossEntropyBackward(XTensor * dedy, const XTensor * output, const XTensor * gold,
const XTensor * weight = NULL, XTensor * padding = NULL,
int leadingDim = -1);
} // namespace nts(NiuTrans.Tensor)
#endif // __CROSSENTROPY_H__
\ No newline at end of file
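Taken together, a minimal end-to-end sketch of this interface (the tensor names and the batchSize variable are illustrative assumptions, not part of the header):

/* forward loss and backward gradient via the API above, assuming
   output and gold are (batchSize, classNum) distribution tensors */
XTensor * perSample = NewTensor1D(batchSize);                /* loss per sample */
_CrossEntropyManual(output, gold, perSample);                /* tensor version  */
DTYPE total = _CrossEntropyManual(output, gold, REDUCE_SUM); /* scalar version  */
XTensor * dedy = NewTensor(output);                          /* dE/dy buffer    */
_CrossEntropyBackward(dedy, output, gold);                   /* dE/dy_i = -t_i/y_i */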
......@@ -20,7 +20,6 @@
*/
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
......
......@@ -23,7 +23,6 @@
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -26,6 +26,7 @@
#include "../XTensor.h"
#include "CrossEntropy.h"
#include "Dropout.h"
#include "HardTanH.h"
#include "Identity.h"
......
......@@ -23,6 +23,7 @@
#include "../XName.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "CrossEntropy.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -72,7 +73,7 @@ y = 1 if x > 1
XTensor HardTanH(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _HardTanH function */
_HardTanH(&x, &y);
......@@ -118,7 +119,9 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
......
......@@ -22,6 +22,7 @@
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -136,8 +137,10 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -21,6 +21,7 @@
#include "../XName.h"
#include "Identity.h"
#include "CrossEntropy.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -46,7 +47,7 @@ make a new tensor to keep the result and return it
XTensor Identity(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _Identity function */
_Identity(&x, &y);
......@@ -78,7 +79,9 @@ void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
......
......@@ -122,10 +122,11 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
for (int i = 0; i < n; i++) {
DTYPE r = (DTYPE)log(exp(ip[i * m + j] - mp[j]) / sp[j]);
if (IsNAN(r))
r = DTYPE_MIN;
r = LOGPROB_MIN;
if (IsINF(r))
r = DTYPE_MIN;
op[i * m + j] = r;
r = LOGPROB_MIN;
op[i * m + j] = MAX(r, LOGPROB_MIN);
}
}
}
......@@ -181,7 +182,7 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, ld);
......
......@@ -78,11 +78,13 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
if (i < rowNum && j < colNum) {
int key = i * colNum + j;
DTYPE r = log(exp(x[key] - inputMax[threadIdx.x]) / inputSum[threadIdx.x]);
if (isnan(r))
r = DTYPE_MIN;
r = LOGPROB_MIN;
if (isinf(r))
r = DTYPE_MIN;
y[key] = r;
r = LOGPROB_MIN;
y[key] = MAX(r, LOGPROB_MIN);
}
}
......@@ -123,11 +125,18 @@ void KernelLogSoftmaxComputeByCol(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
if (i < rowNum && j < colNum) {
int key = i * colNum + j;
DTYPE r = log(exp(x[key] - inputMax[threadIdx.y]) / inputSum[threadIdx.y]);
/*if (r < LOGPROB_MIN)
{
printf("min %e %e, %e %e, %e %e\n", r, x[key] - inputMax[threadIdx.y], x[key], inputMax[threadIdx.y], exp(x[key] - inputMax[threadIdx.y]), inputSum[threadIdx.y]);
}*/
if (isnan(r))
r = DTYPE_MIN;
r = LOGPROB_MIN;
if (isinf(r))
r = DTYPE_MIN;
y[key] = r;
r = LOGPROB_MIN;
y[key] = MAX(r, LOGPROB_MIN);
}
}
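These hunks all apply the same guard when converting to log-probabilities. A compact sketch of the pattern (ClampLogProb is a hypothetical name; IsNAN/IsINF and MAX are the library helpers used in the CPU code, with isnan/isinf as the kernel-side equivalents):

/* NaN/Inf results of log(exp(x - max) / sum) are floored at LOGPROB_MIN
   so that -inf log-probabilities never propagate into later computation */
DTYPE ClampLogProb(DTYPE r)
{
    if (IsNAN(r) || IsINF(r))
        r = LOGPROB_MIN;
    return MAX(r, LOGPROB_MIN);
}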
......@@ -228,21 +237,29 @@ void KernelLogSoftmaxBackwardDEDS(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYP
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
DTYPE r = 0;
/* dE/ds_j = exp(y_j) */
if (lossName == CROSSENTROPY)
dedx[i] = -gold[i] + exp(y[i]);
r = -gold[i] + exp(y[i]);
/* dE/ds_j = exp(y_j) */
else if (lossName == SQUAREDERROR)
dedx[i] = -gold[i] + exp(y[i]);
r = -gold[i] + exp(y[i]);
else if (lossName == ONEHOTERROR) {
if (gold[i] == 1.0F)
dedx[i] = -gold[i] + exp(y[i]);
r = -gold[i] + exp(y[i]);
else
dedx[i] = 0;
r = 0;
}
else {
dedx[i] = dedy[i];
r = dedy[i];
}
if (isnan(r))
r = 0;
if (isinf(r))
r = 0;
dedx[i] = r;
}
}
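The shared rule r = -gold[i] + exp(y[i]) used above for both CROSSENTROPY and SQUAREDERROR is the standard log-softmax gradient: with y = log(softmax(s)) and a normalized gold distribution t,

    dE/ds_j = softmax(s)_j - t_j = exp(y_j) - t_j

and the new NaN/Inf guard zeroes any non-finite entries instead of letting them poison the update.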
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include <math.h>
#include "Loss.h"
......
......@@ -22,6 +22,7 @@
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
#include "CrossEntropy.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -65,7 +66,7 @@ make a new tensor to keep the result and return it
XTensor Rectify(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _Rectify function */
_Rectify(&x, &y);
......@@ -116,7 +117,9 @@ void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
......
......@@ -22,6 +22,7 @@
#include "Rectify.h"
#include "Rectify.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -133,7 +134,9 @@ void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -23,6 +23,7 @@
#include <math.h>
#include "Sigmoid.h"
#include "Sigmoid.cuh"
#include "CrossEntropy.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -63,7 +64,7 @@ make a new tensor to keep the result and return it
XTensor Sigmoid(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _Sigmoid function */
_Sigmoid(&x, &y);
......@@ -107,7 +108,9 @@ void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
......
......@@ -22,6 +22,7 @@
#include "Sigmoid.h"
#include "Sigmoid.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
......@@ -128,7 +129,9 @@ void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
if(lossName == CROSSENTROPY)
_CudaCrossEntropyBackward(dedy, y, gold);
else if(lossName != NOLOSS)
_LossBackward(dedy, gold, y, lossName);
......
......@@ -136,7 +136,7 @@ XTensor Softmax(const XTensor &x, int leadDim)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
y.SetTMPFlag();
/* call _Softmax function */
_Softmax(&x, &y, ld);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/
#include "TCopyIndexed.h"
......@@ -344,6 +344,235 @@ bool TestCopyIndexed3()
#endif // USE_CUDA
}
/*
case 4: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2,
srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
*/
bool TestCopyIndexed4()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
/* an index tensor of size (2) */
int iOrder = 1;
int * iDimSize = new int[iOrder];
iDimSize[0] = 2;
int iUnitNum = 1;
for (int i = 0; i < iOrder; i++)
iUnitNum *= iDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0F, 3.0F} },
{ {1.0F, 4.0F},
{3.0F, 2.0F}},
{ {-1.0F, 2.0F},
{1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
int copyNum = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * index = NewTensor(iOrder, iDimSize, X_INT);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
index->SetData(srcIndex, iUnitNum);
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, (int*)index->data, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete index;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
delete[] iDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete index;
delete[] sDimSize;
delete[] tDimSize;
delete[] iDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 5: copy indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 4), dim = 2, indexSize = 2,
srcIndex = [0, 1], tgtIndex = [0, 2], copyNum = 2.
*/
bool TestCopyIndexed5()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 4) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 4;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][4] = { { {0.0F, -1.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 3.0F, 2.0F},
{1.0F, -1.0F, -1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 1};
int tgtIndex[2] = {0, 2};
int copyNum = 2;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyIndexed function */
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
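Case 5 exercises the copyNum semantics: each source index contributes copyNum consecutive slices along dim. Under that reading (stated here as an assumption, with CopySlice as a hypothetical helper), the expected answer follows:

/* assumed copy rule for case 5 (dim = 2, srcIndex = {0, 1},
   tgtIndex = {0, 2}, copyNum = 2):
   tgt 0 <- src 0, tgt 1 <- src 1, tgt 2 <- src 1, tgt 3 <- src 2,
   e.g. the first row {0, -1, 2} becomes {0, -1, -1, 2} */
for (int k = 0; k < indexSize; k++)
    for (int c = 0; c < copyNum; c++)
        CopySlice(s, srcIndex[k] + c, t, tgtIndex[k] + c); /* hypothetical */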
/* other cases */
/*
TODO!!
......@@ -381,6 +610,24 @@ bool TestCopyIndexed()
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestCopyIndexed4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestCopyIndexed5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
*/
#ifndef __TEST_COPYINDEXED_H__
#define __TEST_COPYINDEXED_H__
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-18
*/
#include "../core/math/Unary.h"
#include "TCos.h"
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#include <math.h>
#include "TCrossEntropy.h"
#include "../core/math/ScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy1()
{
/* a tensor of size (1, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 1;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[4] = {0.25F, 0.25F, 0.25F, 0.25F};
DTYPE goldData[4] = {0.5F, 0.5F, 0.0F, 0.0F};
DTYPE answer = 1.3863F;
DTYPE error1;
DTYPE error2;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * loss = NewTensor1D(1);
/* initialize variables */
output->SetData(outputData, unitNum);
gold->SetData(goldData, unitNum);
/* call CrossEntropy function */
_CrossEntropyManual(output, gold, loss);
error2 = _CrossEntropy(output, gold, REDUCE_SUM);
error1 = loss->Get1D(0);
/* check results */
cpuTest = (fabs(error1 - answer) < 1e-4F &&
fabs(error2 - answer) < 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * lossGPU = NewTensor1D(1, X_FLOAT, 0);
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
goldGPU->SetData(goldData, unitNum);
/* call CrossEntropy function */
_CrossEntropyManual(outputGPU, goldGPU, lossGPU);
error1 = lossGPU->Get1D(0);
error2 = _CrossEntropy(outputGPU, goldGPU, REDUCE_SUM);
/* check results */
gpuTest = (fabs(error1 - answer) < 1e-4F &&
fabs(error2 - answer) < 1e-4F);
/* destroy variables */
delete output;
delete gold;
delete loss;
delete outputGPU;
delete goldGPU;
delete lossGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete loss;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy2()
{
/* a tensor of size (4, 10) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 10;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE outputData[4][10] = { {0.5F, 2.6F, 0.3F, 1.7F, 0.6F,
0.1F, 0.7F, 1.3F, 0.4F, 0.6F},
{0.5F, 1.6F, 0.2F, 1.1F, 0.3F,
0.8F, 2.2F, 0.1F, 0.1F, 0.8F},
{0.2F, 0.5F, 1.1F, 1.2F, 0.6F,
0.1F, 0.2F, 0.7F, 0.5F, 0.7F},
{0.2F, 1.7F, 0.6F, 1.5F, 0.8F,
0.1F, 0.8F, 0.1F, 0.6F, 0.2F} };
DTYPE answer1 = 4.3275F;
DTYPE answer2 = 1.0818F;
DTYPE error1;
DTYPE error2;
DTYPE error3;
DTYPE error4;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
/* initialize variables */
output->SetData(outputData, unitNum);
gold->SetZeroAll();
gold->Set2D(1.0F, 0, 9);
gold->Set2D(1.0F, 1, 7);
gold->Set2D(1.0F, 2, 2);
gold->Set2D(1.0F, 3, 9);
/* call CrossEntropy function */
error1 = _CrossEntropy(output, gold, REDUCE_SUM);
error2 = _CrossEntropy(output, gold, REDUCE_MEAN);
error3 = _CrossEntropyManual(output, gold, REDUCE_SUM);
error4 = _CrossEntropyManual(output, gold, REDUCE_MEAN);
/* check results */
cpuTest = (fabs(error1 - answer1) < 1e-4F &&
fabs(error2 - answer2) < 1e-4F &&
fabs(error3 - answer1) < 1e-4F &&
fabs(error4 - answer2) < 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
goldGPU->SetZeroAll();
goldGPU->Set2D(1.0F, 0, 9);
goldGPU->Set2D(1.0F, 1, 7);
goldGPU->Set2D(1.0F, 2, 2);
goldGPU->Set2D(1.0F, 3, 9);
/* call CrossEntropy function */
error1 = _CrossEntropy(outputGPU, goldGPU, REDUCE_SUM);
error2 = _CrossEntropy(outputGPU, goldGPU, REDUCE_MEAN);
error3 = _CrossEntropyManual(outputGPU, goldGPU, REDUCE_SUM);
error4 = _CrossEntropyManual(outputGPU, goldGPU, REDUCE_MEAN);
/* check results */
gpuTest = (fabs(error1 - answer1) < 1e-4F &&
fabs(error2 - answer2) < 1e-4F &&
fabs(error3 - answer1) < 1e-4F &&
fabs(error4 - answer2) < 1e-4F);
/* destroy variables */
delete output;
delete gold;
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
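The expected values can be verified by hand: the one-hot gold positions select output entries 0.6, 0.1, 1.1 and 0.2, so

    sum  = -(log(0.6) + log(0.1) + log(1.1) + log(0.2)) ≈ 4.3275
    mean = 4.3275 / 4 ≈ 1.0818

(the rows are deliberately not normalized distributions; the case only exercises the arithmetic).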
/*
case 3: test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
In this case, we compute the cross entropy with a class-rescaling weight.
*/
bool TestCrossEntropy3()
{
/* an output tensor of size (4, 4) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 4;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* a weight tensor of size (4) */
int wOrder = 1;
int * wDimSize = new int[wOrder];
wDimSize[0] = 4;
int wUnitNum = 1;
for (int i = 0; i < wOrder; i++)
wUnitNum *= wDimSize[i];
DTYPE outputData[4][4] = { {0.3F, 0.2F, 0.3F, 0.2F},
{0.1F, 0.4F, 0.2F, 0.3F},
{0.7F, 0.1F, 0.1F, 0.1F},
{0.5F, 0.1F, 0.2F, 0.2F} };
DTYPE weightData[4] = {2.0F, 1.0F, 5.0F, 0.0F};
DTYPE answer[4] = {2.4079F, 0.9163F, 11.5129F, 0.0F};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
XTensor * loss = NewTensor1D(4);
XTensor * weight = NewTensor(wOrder, wDimSize);
/* initialize variables */
output->SetData(outputData, unitNum);
weight->SetData(weightData, wUnitNum);
gold->SetZeroAll();
gold->Set2D(1.0F, 0, 0);
gold->Set2D(1.0F, 1, 1);
gold->Set2D(1.0F, 2, 2);
gold->Set2D(1.0F, 3, 3);
/* call CrossEntropy function */
_CrossEntropyManual(output, gold, loss, weight);
/* check results */
cpuTest = loss->CheckData(answer, 4, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * lossGPU = NewTensor1D(4, X_FLOAT, 0);
XTensor * weightGPU = NewTensor(wOrder, wDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
outputGPU->SetData(outputData, unitNum);
weightGPU->SetData(weightData, wUnitNum);
goldGPU->SetZeroAll();
goldGPU->Set2D(1.0F, 0, 0);
goldGPU->Set2D(1.0F, 1, 1);
goldGPU->Set2D(1.0F, 2, 2);
goldGPU->Set2D(1.0F, 3, 3);
/* call CrossEntropy function */
_CrossEntropyManual(outputGPU, goldGPU, lossGPU, weightGPU);
/* check results */
gpuTest = lossGPU->CheckData(answer, 4, 1e-4F);
/* destroy variables */
delete output;
delete gold;
delete loss;
delete weight;
delete outputGPU;
delete goldGPU;
delete lossGPU;
delete weightGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete loss;
delete weight;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
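Again verifiable by hand: with gold one-hot on the diagonal, each per-sample loss is -w_i * log(output[i][i]):

    2.0 * -log(0.3) ≈ 2.4079    1.0 * -log(0.4) ≈ 0.9163
    5.0 * -log(0.1) ≈ 11.5129   0.0 * -log(0.2) = 0.0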
/*
case 4: test CrossEntropy function.
loss = sum_{i} (-t_i * log(y_i))
where t_i is the gold standard and y_i is the model output.
*/
bool TestCrossEntropy4()
{
/* a tensor of size (10, 1) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 10;
dimSize[1] = 1;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
DTYPE answer = 0.0F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
XTensor * gold = NewTensor(order, dimSize);
/* initialize variables */
output->SetZeroAll();
gold->SetZeroAll();
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
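/* output is now all 1s and gold is all 2s, so every term -t * log(y)
   is -2 * log(1) = 0 and the total loss is 0 */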
/* call CrossEntropy function */
error = _CrossEntropyManual(output, gold);
/* check results */
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
outputGPU->SetZeroAll();
goldGPU->SetZeroAll();
_ScaleAndShiftMe(outputGPU, 1, 1);
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call CrossEntropy function */
error = _CrossEntropyManual(outputGPU, goldGPU);
/* check results */
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
delete gold;
delete outputGPU;
delete goldGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete output;
delete gold;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for CrossEntropy Function */
bool TestCrossEntropy()
{
XPRINT(0, stdout, "[TEST CrossEntropy] compute the cross entropy loss and backward gradient \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestCrossEntropy1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestCrossEntropy2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestCrossEntropy3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestCrossEntropy4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
///* other cases test */
///*
//TODO!!
//*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-17
*/
#ifndef __TEST_CROSSENTROPY_H__
#define __TEST_CROSSENTROPY_H__
#include "../function/CrossEntropy.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for CrossEntropy Function */
bool TestCrossEntropy();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_CROSSENTROPY_H__
......@@ -259,7 +259,7 @@ bool TestDivDim2()
/* test for DivDim Function */
bool TestDivDim()
{
XPRINT(0, stdout, "[TEST DIVDIM] tensor division c(i) = a/b + \alpha * c by broadcasting\n");
XPRINT(0, stdout, "[TEST DIVDIM] tensor division c(i) = a/b + \\alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
......
......@@ -54,10 +54,10 @@ bool TestDropout1()
y->SetZeroAll();
/* call Dropout function */
float drop_prob = 0.2F;
float dropProb = 0.2F;
int seed = 20;
_Dropout(x, y, seed, drop_prob);
yUser = Dropout(*x, drop_prob);
_Dropout(x, y, seed, dropProb);
yUser = Dropout(*x, dropProb);
/* check result */
int zeroNum1 = 0;
......@@ -74,9 +74,9 @@ bool TestDropout1()
}
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, dropProb);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, dropProb);
#ifdef USE_CUDA
/* GPU test */
......@@ -92,8 +92,8 @@ bool TestDropout1()
yGPU->SetZeroAll();
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, drop_prob);
yUserGPU = Dropout(*xGPU, drop_prob);
_Dropout(xGPU, yGPU, seed, dropProb);
yUserGPU = Dropout(*xGPU, dropProb);
/* check result */
zeroNum1 = 0;
......@@ -110,9 +110,9 @@ bool TestDropout1()
}
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, dropProb);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, dropProb);
/* destroy variables */
delete x;
......@@ -163,10 +163,10 @@ bool TestDropout2()
_SetDataFixedFloat(dedy, 1.5F);
/* call Dropout function */
float drop_prob = 0.5F;
float dropProb = 0.5F;
int seed = 1;
_Dropout(x, y, seed, drop_prob);
_DropoutBackward(y, x, dedy, dedx, 1, drop_prob);
_Dropout(x, y, seed, dropProb);
_DropoutBackward(y, x, dedy, dedx, 1, dropProb);
/* check result */
y->Dump(stderr, "y");
......@@ -189,8 +189,8 @@ bool TestDropout2()
_SetDataFixedFloat(dedyGPU, 1.5F);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, drop_prob);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, drop_prob);
_Dropout(xGPU, yGPU, seed, dropProb);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, dropProb);
/* check result */
yGPU->Dump(stderr, "yGPU");
......
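The truncated dropout hunks above only rename drop_prob to dropProb; for intuition about what _Dropout/_DropoutBackward compute, here is a standalone sketch of the usual mask-based forward/backward pair (plain C++, assuming the common inverted-dropout scaling; the library's exact scaling convention may differ):
#include <cstdlib>
/* illustrative sketch: drop each unit with probability p, scale the
   survivors by 1 / (1 - p), and reuse the same mask in the backward pass */
void DropoutForwardBackward(const float * x, float * y,
                            const float * dedy, float * dedx,
                            float * mask, int n, float p)
{
    for (int i = 0; i < n; i++) {
        mask[i] = (std::rand() / (float)RAND_MAX < p) ? 0.0F : 1.0F / (1.0F - p);
        y[i] = x[i] * mask[i];       /* forward: y = x * mask */
        dedx[i] = dedy[i] * mask[i]; /* backward: dE/dx = dE/dy * mask */
    }
}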
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-18
*/
#include "TGather.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 2, 2), dim = 2,
srcIndex = [0, 2], indexSize = 2
*/
bool TestGather1()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 2, 2) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 2;
tDimSize[2] = 2;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
{2.0F, 3.0F} },
{ {1.0F, 4.0F},
{3.0F, 2.0F}},
{ {-1.0F, 2.0F},
{1.0F, 0.0F} } };
int dim = 2;
int indexSize = 2;
int srcIndex[2] = {0, 2};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
tUser = Gather(*s, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
tUserGPU = Gather(*sGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
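Concretely, case 1 checks the mapping out[i][j][k] = in[i][j][srcIndex[k]]. A plain-C++ sketch of that dim = 2 gather for the fixed shapes used above (GatherDim2 is illustrative only, not the library kernel):
/* gather along the innermost dimension: pick the columns named by srcIndex */
void GatherDim2(const float in[3][2][3], float out[3][2][2],
                const int srcIndex[2])
{
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 2; j++)
            for (int k = 0; k < 2; k++)
                out[i][j][k] = in[i][j][srcIndex[k]];
}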
/*
case 2: gather indexed sub-tensors
In this case, (3, 2, 3) -> (3, 1, 3), dim = 1,
srcIndex = [0], indexSize = 1
*/
bool TestGather2()
{
/* an input tensor of size (3, 2, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 3;
sDimSize[1] = 2;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* an output tensor of size (3, 1, 3) */
int tOrder = 3;
int * tDimSize = new int[tOrder];
tDimSize[0] = 3;
tDimSize[1] = 1;
tDimSize[2] = 3;
int tUnitNum = 1;
for (int i = 0; i < tOrder; i++)
tUnitNum *= tDimSize[i];
DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
{2.0F, 1.0F, 3.0F} },
{ {1.0F, 2.0F, 4.0F},
{3.0F, 1.0F, 2.0F}},
{ {-1.0F, 3.0F, 2.0F},
{1.0F, -1.0F, 0.0F} } };
DTYPE answer[3][1][3] = { { {0.0F, -1.0F, 2.0F} },
{ {1.0F, 2.0F, 4.0F} } ,
{ {-1.0F, 3.0F, 2.0F} } };
int dim = 1;
int indexSize = 1;
int srcIndex[1] = {0};
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call Gather function */
_Gather(s, t, dim, srcIndex, indexSize);
tUser = Gather(*s, dim, srcIndex, indexSize);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Gather function */
_Gather(sGPU, tGPU, dim, srcIndex, indexSize);
tUserGPU = Gather(*sGPU, dim, srcIndex, indexSize);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
delete t;
delete sGPU;
delete tGPU;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete[] sDimSize;
delete[] tDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Gather Function */
bool TestGather()
{
XPRINT(0, stdout, "[TEST Gather] gather indexed sub-tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestGather1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestGather2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-18
*/
#ifndef __TEST_GATHER_H__
#define __TEST_GATHER_H__
#include "../core/movement/Gather.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Gather Function */
bool TestGather();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_GATHER_H__
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#ifndef __TEST_LOSS_H__
#define __TEST_LOSS_H__
......
......@@ -251,7 +251,7 @@ bool TestMultiplyDim2()
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
XPRINT(0, stdout, "[TEST MULTIPLYDIM] tensor multiplication c = a * b + \alpha * c by broadcasting\n");
XPRINT(0, stdout, "[TEST MULTIPLYDIM] tensor multiplication c = a * b + \\alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#include "TReduceSumAll.h"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test ReduceSumAll function
sum all the items of the tensor
*/
bool TestReduceSumAll1()
{
/* a tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE summation;
DTYPE answer = 28.0F;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
/* initialize variables */
s->SetData(sData, sUnitNum);
/* call ReduceSumAll function */
summation = _ReduceSumAll(s);
/* check results */
cpuTest = (fabs(answer - summation) < 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
/* call ReduceSumAll function */
summation = _ReduceSumAll(sGPU);
/* check results */
gpuTest = (fabs(answer - summation) < 1e-4F);
/* destroy variables */
delete s;
delete sGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
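What _ReduceSumAll returns here is simply the sum of every item; for the data above that is 0 + 1 + ... + 7 = 28. A trivial reference version (plain C++; SumAll is illustrative only):
/* reference implementation of a full reduction over a flat buffer */
float SumAll(const float * data, int unitNum)
{
    float sum = 0.0F;
    for (int i = 0; i < unitNum; i++)
        sum += data[i];
    return sum;
}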
/* other cases */
/*
TODO!!
*/
/* test for ReduceSumAll Function */
bool TestReduceSumAll()
{
XPRINT(0, stdout, "[TEST ReduceSumAll] sum the items along a dimension of the tensor.\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestReduceSumAll1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-27
*/
#ifndef __TEST_REDUCESUMALL_H__
#define __TEST_REDUCESUMALL_H__
#include "../core/reduce/ReduceSumAll.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ReduceSumAll Function */
bool TestReduceSumAll();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_REDUCESUMALL_H__
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
*/
#include "TSetData.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -80,6 +81,327 @@ bool TestSetData1()
#endif // USE_CUDA
}
/*
case 2: test SetDataIndexed function.
modify data items along a given dimension.
*/
bool TestSetData2()
{
/* an input tensor of size (2, 4) */
int sOrder = 2;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a data tensor of size (4) */
int dataOrder = 1;
int * dataDimSize = new int[dataOrder];
dataDimSize[0] = 4;
int dataUnitNum = 1;
for (int i = 0; i < dataOrder; i++)
dataUnitNum *= dataDimSize[i];
DTYPE data[4] = {0.0F, 1.0F, 2.0F, 3.0F};
DTYPE answer[2][4] = { {1.0F, 1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F, 3.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * modify = NewTensor(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 1.0F);
modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
_SetDataIndexed(s, modify, 0, 1);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum, 1e-5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
_SetDataIndexed(sGPU, modifyGPU, 0, 1);
gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
/* destroy variables */
delete s;
delete modify;
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete modify;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
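Read the call _SetDataIndexed(s, modify, 0, 1) above as: overwrite slice 1 along dimension 0 of s with modify, leaving every other row untouched. A 2-D sketch for the shapes used in case 2 (SetRowIndexed is an illustrative helper, not library code):
/* overwrite row `index` of a (2, 4) matrix with `modify` */
void SetRowIndexed(float s[2][4], const float modify[4], int index)
{
    for (int j = 0; j < 4; j++)
        s[index][j] = modify[j];
}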
/*
case 3: test SetDataIndexed function.
modify data items along a given dimension.
*/
bool TestSetData3()
{
/* an input tensor of size (2, 4, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 2;
sDimSize[1] = 4;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a data tensor of size (2, 3) */
int dataOrder = 2;
int * dataDimSize = new int[dataOrder];
dataDimSize[0] = 2;
dataDimSize[1] = 3;
int dataUnitNum = 1;
for (int i = 0; i < dataOrder; i++)
dataUnitNum *= dataDimSize[i];
DTYPE data[2][3] = { {0.0F, 1.0F, 2.0F},
{3.0F, 4.0F, 5.0F} };
DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * modify = NewTensor(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 1.0F);
modify->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
_SetDataIndexed(s, modify, 1, 1);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum, 1e-5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 1.0F);
modifyGPU->SetData(data, dataUnitNum);
/* call SetDataIndexed function */
_SetDataIndexed(sGPU, modifyGPU, 1, 1);
gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
/* destroy variables */
delete s;
delete modify;
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete modify;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 4: test SetDataDim function.
set data items along a given dimension (and keep the remaining items unchanged)
*/
bool TestSetData4()
{
/* an input tensor of size (3, 3) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 3;
dimSize[1] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE sData[3][3] = { {1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F},
{7.0F, 8.0F, 9.0F} };
DTYPE answer[3][3] = { {1.0F, 2.0F, 3.0F},
{0.0F, 0.0F, 0.0F},
{7.0F, 8.0F, 9.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(order, dimSize);
/* initialize variables */
s->SetData(sData, unitNum);
/* call _SetDataDim function */
_SetDataDim(s, 1, 1, 0, 0);
/* check results */
cpuTest = s->CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(sData, unitNum);
/* call _SetDataDim function */
_SetDataDim(sGPU, 1, 1, 0, 0);
gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete s;
delete sGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
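Judging from the calls in cases 4 and 5, _SetDataDim(s, beg, len, dim, p) sets every item whose coordinate along dim falls in [beg, beg + len) to p; this reading is inferred from the tests, not from a documented signature. A 2-D sketch for case 4 (SetDataDim2D is illustrative only):
/* set items whose index along `dim` lies in [beg, beg + len) to `value`;
   SetDataDim2D(sData, 1, 1, 0, 0.0F) zeroes row 1, as in the answer above */
void SetDataDim2D(float s[3][3], int beg, int len, int dim, float value)
{
    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++) {
            int coord = (dim == 0) ? i : j;
            if (coord >= beg && coord < beg + len)
                s[i][j] = value;
        }
}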
/*
case 5: test SetDataDim function.
set data items along a given dimension (and keep the remaining items unchanged)
*/
bool TestSetData5()
{
/* an input tensor of size (2, 4, 3) */
int order = 3;
int * dimSize = new int[order];
dimSize[0] = 2;
dimSize[1] = 4;
dimSize[2] = 3;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE data[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} } };
DTYPE answer[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{5.0F, 5.0F, 5.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{5.0F, 5.0F, 5.0F},
{1.0F, 1.0F, 1.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(order, dimSize);
/* initialize variables */
s->SetData(data, unitNum);
/* call _SetDataDim function */
_SetDataDim(s, 2, 1, 1, 5.0F);
/* check results */
cpuTest = s->CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
sGPU->SetData(data, unitNum);
/* call _SetDataDim function */
_SetDataDim(sGPU, 2, 1, 1, 5.0F);
gpuTest = sGPU->CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete s;
delete sGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -100,6 +422,42 @@ bool TestSetData()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSetData2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */
caseFlag = TestSetData3();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 3 failed!\n");
}
else
XPRINT(0, stdout, ">> case 3 passed!\n");
/* case 4 test */
caseFlag = TestSetData4();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 4 failed!\n");
}
else
XPRINT(0, stdout, ">> case 4 passed!\n");
/* case 5 test */
caseFlag = TestSetData5();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 5 failed!\n");
}
else
XPRINT(0, stdout, ">> case 5 passed!\n");
/* other cases test */
/*
TODO!!
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#include "TSpread.h"
#include "../core/getandset/SetData.h"
#include "../core/movement/Spread.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test _Spread function.
spread a collection tensor into a source tensor.
*/
bool TestSpread1()
{
/* an input tensor of size (4, 4, 3) */
int sOrder = 3;
int * sDimSize = new int[sOrder];
sDimSize[0] = 4;
sDimSize[1] = 4;
sDimSize[2] = 3;
int sUnitNum = 1;
for (int i = 0; i < sOrder; i++)
sUnitNum *= sDimSize[i];
/* a data tensor of size (2, 4, 3) */
int dataOrder = 3;
int * dataDimSize = new int[dataOrder];
dataDimSize[0] = 2;
dataDimSize[1] = 4;
dataDimSize[2] = 3;
int dataUnitNum = 1;
for (int i = 0; i < dataOrder; i++)
dataUnitNum *= dataDimSize[i];
int srcIndex[2] = {0, 1};
int tgtIndex[2] = {0, 1};
DTYPE data[2][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} } };
DTYPE answer[4][4][3] = { { {1.0F, 1.0F, 1.0F},
{0.0F, 1.0F, 2.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {1.0F, 1.0F, 1.0F},
{3.0F, 4.0F, 5.0F},
{1.0F, 1.0F, 1.0F},
{1.0F, 1.0F, 1.0F} },
{ {0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F} },
{ {0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F},
{0.0F, 0.0F, 0.0F} } };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * modify = NewTensor(dataOrder, dataDimSize);
/* Initialize variables */
_SetDataFixedFloat(s, 0.0F);
modify->SetData(data, dataUnitNum);
/* call _Spread function */
_Spread(s, modify, 0, srcIndex, 2, tgtIndex);
/* check results */
cpuTest = s->CheckData(answer, sUnitNum, 1e-5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * modifyGPU = NewTensor(dataOrder, dataDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
_SetDataFixedFloat(sGPU, 0.0F);
modifyGPU->SetData(data, dataUnitNum);
/* call _Spread function */
_Spread(sGPU, modifyGPU, 0, srcIndex, 2, tgtIndex);
gpuTest = sGPU->CheckData(answer, sUnitNum, 1e-5F);
/* destroy variables */
delete s;
delete modify;
delete sGPU;
delete modifyGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete modify;
delete[] sDimSize;
return cpuTest;
#endif // USE_CUDA
}
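In case 1 the two index arrays are identical, so their roles cannot be told apart from the data alone; the reading below (copy collection slice tgtIndex[i] into source slice srcIndex[i] along dim 0) is an inference from this single test, not a documented contract. A plain-C++ sketch for the fixed shapes above (SpreadDim0 is illustrative only):
/* scatter collection slices into the source tensor along dimension 0;
   slices of s not named in srcIndex keep their previous values */
void SpreadDim0(float s[4][4][3], const float c[2][4][3],
                const int srcIndex[2], const int tgtIndex[2], int indexSize)
{
    for (int i = 0; i < indexSize; i++)
        for (int j = 0; j < 4; j++)
            for (int k = 0; k < 3; k++)
                s[srcIndex[i]][j][k] = c[tgtIndex[i]][j][k];
}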
/* other cases */
/*
TODO!!
*/
/* test for Spread Function */
bool TestSpread()
{
XPRINT(0, stdout, "[TEST Spread] spread a collection tensor to source tensor \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSpread1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-25
*/
#ifndef __TEST_SPREAD_H__
#define __TEST_SPREAD_H__
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Spread Function */
bool TestSpread();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SPREAD_H__
\ No newline at end of file
......@@ -28,7 +28,7 @@ bool Test()
{
bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");
wrong = !TestAbsolute() || wrong;
wrong = !TestClip() || wrong;
wrong = !TestConcatenate() || wrong;
......@@ -40,6 +40,7 @@ bool Test()
wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong;
wrong = !TestGather() || wrong;
wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong;
wrong = !TestMatrixMul2D() || wrong;
......@@ -54,6 +55,7 @@ bool Test()
wrong = !TestReduceMax() || wrong;
wrong = !TestReduceMean() || wrong;
wrong = !TestReduceSum() || wrong;
wrong = !TestReduceSumAll() || wrong;
wrong = !TestReduceSumSquared() || wrong;
wrong = !TestReduceVariance() || wrong;
wrong = !TestRound() || wrong;
......@@ -75,7 +77,8 @@ bool Test()
//wrong = !TestTopK() || wrong;
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
wrong = !TestCrossEntropy() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong;
......
......@@ -33,6 +33,7 @@
#include "TDiv.h"
#include "TDivDim.h"
#include "TExp.h"
#include "TGather.h"
#include "TLog.h"
#include "TMatrixMul.h"
#include "TMatrixMul2D.h"
......@@ -47,6 +48,7 @@
#include "TReduceMax.h"
#include "TReduceMean.h"
#include "TReduceSum.h"
#include "TReduceSumAll.h"
#include "TReduceSumSquared.h"
#include "TReduceVariance.h"
#include "TRound.h"
......@@ -69,6 +71,7 @@
#include "TUnsqueeze.h"
#include "TXMem.h"
#include "TCrossEntropy.h"
#include "TDropout.h"
#include "THardTanH.h"
#include "TIdentity.h"
......