Commit f7f33b29 by xuchen

code optimization

parent e223c59c
...@@ -35,57 +35,61 @@ void XMathGrad::MakeGrad(XTensor * node)
    XLink &income = node->income;
    int operID = income.typeID;

    if(operID == MATH_ABSOLUTE)
        GradAbsolute(node);
    else if(operID == MATH_COS)
        GradCos(node);
    else if(operID == MATH_EXP)
        GradExp(node);
    else if(operID == MATH_LOG)
        GradLog(node);
    else if(operID == MATH_ROUND)
        GradRound(node);
    else if(operID == MATH_SIGN)
        GradSign(node);
    else if(operID == MATH_SIN)
        GradSin(node);
    else if(operID == MATH_TAN)
        GradTan(node);
    else if(operID == MATH_CLIP)
        GradClip(node);
    else if(operID == MATH_DIV)
        GradDiv(node);
    else if(operID == MATH_DIVDIM)
        GradDivDim(node);
    else if(operID == MATH_MATRIXMUL)
        GradMatrixMul(node);
    else if(operID == MATH_MATRIXMULBATCHED)
        GradMatrixMulBatched(node);
    else if(operID == MATH_MULTIPLY)
        GradMultiply(node);
    else if(operID == MATH_MULTIPLYDIM)
        GradMultiplyDim(node);
    else if(operID == MATH_NEGATE)
        GradNegate(node);
    else if(operID == MATH_NORMALIZE)
        GradNormalize(node);
    else if(operID == MATH_POWER)
        GradPower(node);
    else if(operID == MATH_SCALEANDSHIFT)
        GradScaleAndShift(node);
    else if(operID == MATH_SUB)
        GradSub(node);
    else if(operID == MATH_SUBDIM)
        GradSubDim(node);
    else if(operID == MATH_SUM)
        GradSum(node);
    else if(operID == MATH_SUMDIM)
        GradSumDim(node);
    else if(operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node);
    else if(operID == REDUCE_REDUCESUM)
        GradReduceSum(node);
    else if(operID == REDUCE_REDUCESUMSQUARED)
        GradReduceSumSquared(node);
    else if(operID == REDUCE_REDUCEVARIANCE)
        GradReduceVariance(node);
    else{
        ShowNTErrors("TODO!");
...@@ -99,427 +103,82 @@ bool XMathGrad::IsMathOP(XTensor * node)
    return (income.typeID & MATH_BASE) != 0;
}
/*
gradient for sum
for
c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSum(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSumDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
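As a concrete view of the reshape-and-reduce trick used above (hypothetical sizes, not taken from the commit): if a has shape (2, 3, 4) and n = 1, then |b| = 3 and dE/dc is reshaped to a 3D tensor of size (x, y, z) = (2, 3, 4). The gradient of b is then obtained by reducing over z and x,

    dE/db_j = \beta * sum_x sum_z dE/dc_{x, j, z},    j = 0, ..., |b|-1

which is exactly what the two _ReduceSum calls compute.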
/*
gradient for absolute
for
c = |a|
we have
dE/da =  dE/dc    a >= 0
        -dE/dc    a < 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradAbsolute(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _Sign(a, b);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for cos
for
c = cos(a)
we have
dE/da = dE/dc * -sin(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradCos(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _Sin(a, b);
    _ScaleAndShiftMe(b, -1.0F);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for exp
for
c = exp(a)
we have
dE/da = dE/dc * exp(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradExp(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _Exp(a, b);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
}
...@@ -547,54 +206,63 @@ void XMathGrad::GradLog(XTensor * node)
}

/*
gradient for round
for
c = round(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradRound(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");

    // we do nothing here
    // TODO: set grad = 0 if the node is the only child

    node->visitMark = NODE_FINISHED;
}

/*
gradient for sign
for
c = sign(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSign(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");

    // we do nothing here
    // TODO: set grad = 0 if the node is the only child

    node->visitMark = NODE_FINISHED;
}

/*
gradient for sin
for
c = sin(a)
we have
dE/da = dE/dc * cos(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSin(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _Cos(a, b);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensorBuf(b);
...@@ -602,52 +270,56 @@ void XMathGrad::GradNegate(XTensor * node)
}
/*
gradient for tan
for
c = tan(a)
we have
dE/da = dE/dc * 1/(cos(a))^2
>> node - the node (c) for backward computation
*/
void XMathGrad::GradTan(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _Cos(a, b);
    _PowerMe(b, -2.0F);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for clip
we have
dE/da = 1    lower < a < upper
dE/da = 0    otherwise
>> node - the node (c) for backward computation
*/
void XMathGrad::GradClip(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    DTYPE lower = income.GetParam(0);
    DTYPE upper = income.GetParam(1);

    XNoder::MakeGrad(a);

    _ClipBackward(node, a, node->grad, a->grad, lower, upper);
    _Sum(a->grad, b, a->grad);

    DelTensorBuf(b);

    node->visitMark = NODE_FINISHED;
}
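A rough element-wise sketch of the clip gradient used above. This is an assumption based on the comment (dE/da = 1 inside (lower, upper) and 0 outside), not the actual _ClipBackward implementation:

    /* hypothetical scalar version of the clip backward rule */
    float ClipGradElement(float a, float dedc, float lower, float upper)
    {
        /* pass the gradient through only where a was not clipped */
        return (a > lower && a < upper) ? dedc : 0.0F;
    }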
...@@ -686,13 +358,13 @@ void XMathGrad::GradDiv(XTensor * node)
    node->visitMark = NODE_FINISHED;
}

/*
gradient for division with one dimension
c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * (1/b)
dE/db = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradDivDim(XTensor * node)
{
...@@ -787,106 +459,331 @@ void XMathGrad::GradDivDim(XTensor * node)
    node->visitMark = NODE_FINISHED;
}
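The corrected formula above follows from the quotient rule: with c = a / b and b broadcast over dimension n,

    dE/db = dE/dc * d(a/b)/db = dE/dc * (-a / b^2)

and the element-wise products are then summed over every dimension except n, which is what the reduce(0,...,n-1,n+1,...) notation denotes.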
/*
gradient for matrix multiply
for c = matmul(a, b) * \alpha
we have
dE/da = dE/dc * b^T * \alpha
dE/db = a^T * dE/dc * \alpha
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMatrixMul(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(deda->order == 2 && dedb->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
else if(transA == X_NOTRANS && deda->order > 2 && dedb->order == 2){
int orderBackupA = a->order;
int orderBackupC = c->order;
int dimsBackupA[MAX_TENSOR_DIM_NUM];
int dimsBackupC[MAX_TENSOR_DIM_NUM];
memcpy(dimsBackupA, a->dimSize, sizeof(int) * a->order);
memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
a->Reshape(orderBackupA, dimsBackupA);
c->Reshape(orderBackupC, dimsBackupC);
deda->Reshape(orderBackupA, dimsBackupA);
dedc->Reshape(orderBackupC, dimsBackupC);
}
else{
ShowNTErrors("TODO!");
}
node->visitMark = NODE_FINISHED;
}
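A note on the branch above: when a (and thus c) has more than two dimensions but b is a matrix, the leading dimensions of a, c and their gradients are flattened into rows (Reshape to unitNum/GetDim(-1) by GetDim(-1)), the 2D rule is reused, and the original shapes are restored afterwards. Only the transA == X_NOTRANS case is handled this way, hence the TODO for the remaining combinations.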
/*
gradient for matrix multiply: c = matmul(a, b) * \alpha
>> a - as it is
>> deda - dE/da
>> b - as it is
>> dedb - dE/db
>> dedc - dE/dc
>> alpha - the scalar
*/
void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
                              XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
                              XTensor * dedc, DTYPE alpha)
{
    /* c = a * b * \alpha */
    if(transA == X_NOTRANS && transB == X_NOTRANS){
        /* dE/da = dE/dc * b^T * \alpha */
        _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a^T * dE/dc * \alpha */
        _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b * \alpha */
    else if(transA == X_TRANS && transB == X_NOTRANS){
        /* dE/da = (dE/dc * b^T)^T * \alpha
                 = b * dE/dc^T * \alpha */
        _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a * dE/dc * \alpha */
        _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a * b^T * \alpha */
    else if(transA == X_NOTRANS && transB == X_TRANS){
        /* dE/da = dE/dc * b * \alpha */
        _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
        /* dE/db = (a^T * dE/dc)^T * \alpha
                 = dE/dc^T * a * \alpha */
        _MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b^T * \alpha */
    else if(transA == X_TRANS && transB == X_TRANS){
        /* dE/da = (dE/dc * b)^T * \alpha
                 = b^T * dE/dc^T * \alpha */
        _MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = (a * dE/dc)^T * \alpha
                 = dE/dc^T * a^T * \alpha */
        _MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
    }
}

/*
gradient for matrix multiply in batch mode.
for each batch: c_i = matmul(a_i, b_i) * \alpha
we have
dE/da_i = dE/dc_i * b_i^T * \alpha
dE/db_i = a_i^T * dE/dc_i * \alpha
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMatrixMulBatched(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
    CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MULTIPLY!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
    MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
    DTYPE alpha = income.GetParam(2);

    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    XTensor * dedc = node->grad;
    XTensor * deda = a->grad;
    XTensor * dedb = b->grad;

    /* c = a * b * \alpha */
    if(transA == X_NOTRANS && transB == X_NOTRANS){
        /* dE/da = dE/dc * b^T * \alpha */
        _MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a^T * dE/dc * \alpha */
        _MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b * \alpha */
    else if(transA == X_TRANS && transB == X_NOTRANS){
        /* dE/da = (dE/dc * b^T)^T * \alpha
                 = b * dE/dc^T * \alpha */
        _MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = a * dE/dc * \alpha */
        _MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a * b^T * \alpha */
    else if(transA == X_NOTRANS && transB == X_TRANS){
        /* dE/da = dE/dc * b * \alpha */
        _MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
        /* dE/db = (a^T * dE/dc)^T * \alpha
                 = dE/dc^T * a * \alpha */
        _MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
    }
    /* c = a^T * b^T * \alpha */
    else if(transA == X_TRANS && transB == X_TRANS){
        /* dE/da = (dE/dc * b)^T * \alpha
                 = b^T * dE/dc^T * \alpha */
        _MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        /* dE/db = (a * dE/dc)^T * \alpha
                 = dE/dc^T * a^T * \alpha */
        _MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
    }

    node->visitMark = NODE_FINISHED;
}

/*
gradient for multiply (dot production)
for
c = a * b
we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMultiply(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");

    _Multiply(node->grad, b, a->grad, 1.0F);
    _Multiply(node->grad, a, b->grad, 1.0F);

    node->visitMark = NODE_FINISHED;
}
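A short check of one transposed case handled in GradMatrixMul / GradMatrixMulBatched above: for c = a^T * b we have c_{ij} = sum_k a_{ki} * b_{kj}, so

    dE/da_{ki} = sum_j dE/dc_{ij} * b_{kj},   i.e.   dE/da = b * dE/dc^T
    dE/db_{kj} = sum_i dE/dc_{ij} * a_{ki},   i.e.   dE/db = a * dE/dc

which matches the _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, ...) and _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, ...) calls, scaled by \alpha.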
/*
gradient for multiply with one dimension
c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradMultiplyDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
XTensor * a = income.tails[0]; XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem); XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
            DelTensorBuf(bGradTMP2);
        }
        else{
            _ReduceSum(bGradTMP, b->grad, 0);
        }
    }
    else{
        int reshapedSize[MAX_TENSOR_DIM_NUM];
        reshapedSize[0] = 1;
        reshapedSize[1] = dimSize[n];
        reshapedSize[2] = 1;

        for(int i = 0; i < order; i++){
            if(i < n)
                reshapedSize[0] *= dimSize[i];
        }

        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);

        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
           Then reduce along with z and x to obtain dE/db. */
        bGradTMP->Reshape(3, reshapedSize);

        XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
        _ReduceSum(bGradTMP, interGrad, 2);

        if(b->outgo.tailNum > 1){
            XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
            _ReduceSum(interGrad, bGradTMP2, 0);
            _Sum(b->grad, bGradTMP2, b->grad);
            DelTensorBuf(bGradTMP2);
        }
        else{
            _ReduceSum(interGrad, b->grad, 0);
        }

        DelTensorBuf(interGrad);
    }

    DelTensor(bGradTMP);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for negate
for
c = -a
we have
dE/da = dE/dc * (-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradNegate(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    XNoder::MakeGrad(a);

    _ScaleAndShift(node->grad, b, -1.0F);
    _Sum(a->grad, b, a->grad);

    DelTensorBuf(b);
...@@ -982,95 +879,299 @@ void XMathGrad::GradNormalize(XTensor * node)
}

/*
gradient for power
for
c = pow(a,p)
we have
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradPower(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");

    XTensor * a = income.tails[0];
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    DTYPE p = income.GetParam(0);

    XNoder::MakeGrad(a);

    _Power(a, b, p - 1.0F);
    _ScaleAndShiftMe(b, p);
    _Multiply(node->grad, b, a->grad, 1.0F);

    DelTensor(b);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for ScaleAndShift
for
c = a * scale + shift
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
*/
void XMathGrad::GradScaleAndShift(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");

    XTensor * a = income.tails[0];

    DTYPE scale = income.GetParam(0);

    XNoder::MakeGrad(a);

    _Sum(a->grad, node->grad, a->grad, scale);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for minus
for
c = a - b * \beta
we have
dE/da = dE/dc
dE/db = -dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSub(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    _Sum(a->grad, node->grad, a->grad);
    _Sum(b->grad, node->grad, b->grad, -beta);

    node->visitMark = NODE_FINISHED;
}
/*
gradient for subtraction with one dimension
c = a - b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = - dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSubDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
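In both branches above the reduced gradient enters b->grad with a negative sign, matching dE/db = -\beta * (the reduction of dE/dc over all dimensions except n): when b has other children the temporary is subtracted from b->grad via _Sub, and when b->grad is written directly it is scaled by \beta and then flipped with _ScaleAndShiftMe(b->grad, -1.0F).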
/*
gradient for sum
for
c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSum(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

    DTYPE beta = income.GetParam(0);

    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    _Sum(a->grad, node->grad, a->grad);
    _Sum(b->grad, node->grad, b->grad, beta);

    node->visitMark = NODE_FINISHED;
}

/*
gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSumDim(XTensor * node)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

    int n = income.GetParamInt(0);
    DTYPE beta = income.GetParam(1);

    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    _Sum(a->grad, node->grad, a->grad);

    int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
    node->visitMark = NODE_FINISHED;
}
......
...@@ -40,23 +40,50 @@ public: ...@@ -40,23 +40,50 @@ public:
bool IsMathOP(XTensor * node); bool IsMathOP(XTensor * node);
private: private:
/* gradient for sum: c = a + b * \beta */
/* gradient for absolute */
static static
void GradSum(XTensor * node); void GradAbsolute(XTensor * node);
/* gradient for cos */
static
void GradCos(XTensor * node);
/* gradient for exp */
static
void GradExp(XTensor * node);
/* gradient for sum with one dimension: c = a + b * \beta /* gradient for log: c = log(a) */
where the size of b is equal to that of one dimension of a */
static static
void GradSumDim(XTensor * node); void GradLog(XTensor * node);
/* gradient for round */
static
void GradRound(XTensor * node);
/* gradient for sign */
static
void GradSign(XTensor * node);
/* gradient for multiply (dot production): c = a * b * \alpha */ /* gradient for sin */
static static
void GradMultiply(XTensor * node); void GradSin(XTensor * node);
/* gradient for multiply one dimension: c = a * b * \alpha /* gradient for tan */
where the size of b is equal to that of one dimension of a */
static static
void GradMultiplyDim(XTensor * node); void GradTan(XTensor * node);
/* gradient for clip */
static
void GradClip(XTensor * node);
/* gradient for Divide */
static
void GradDiv(XTensor * node);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */ /* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static static
...@@ -73,17 +100,26 @@ private: ...@@ -73,17 +100,26 @@ private:
static static
void GradMatrixMulBatched(XTensor * node); void GradMatrixMulBatched(XTensor * node);
/* gradient for log: c = log(a) */ /* gradient for multiply (dot production): c = a * b * \alpha */
static static
void GradLog(XTensor * node); void GradMultiply(XTensor * node);
/* gradient for power */ /* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static static
void GradPower(XTensor * node); void GradMultiplyDim(XTensor * node);
/* gradient for negate */ /* gradient for negate */
static static
void GradNegate(XTensor * node); void GradNegate(XTensor * node);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
/* gradient for power */
static
void GradPower(XTensor * node);
/* gradient for ScaleAndShift */ /* gradient for ScaleAndShift */
static static
...@@ -92,14 +128,20 @@ private: ...@@ -92,14 +128,20 @@ private:
/* gradient for Minus */ /* gradient for Minus */
static static
void GradSub(XTensor * node); void GradSub(XTensor * node);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node);
/* gradient for Divide */ /* gradient for sum: c = a + b * \beta */
static static
void GradDiv(XTensor * node); void GradSum(XTensor * node);
/* gradient for DivideDim */ /* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static static
void GradDivDim(XTensor * node); void GradSumDim(XTensor * node);
/* gradient for reduceMean */ /* gradient for reduceMean */
static static
...@@ -116,42 +158,6 @@ private: ...@@ -116,42 +158,6 @@ private:
/* gradient for reduceVariance */ /* gradient for reduceVariance */
static static
void GradReduceVariance(XTensor * node); void GradReduceVariance(XTensor * node);
/* gradient for sin */
static
void GradSin(XTensor * node);
/* gradient for cos */
static
void GradCos(XTensor * node);
/* gradient for tan */
static
void GradTan(XTensor * node);
/* gradient for exp */
static
void GradExp(XTensor * node);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
/* gradient for absolute */
static
void GradAbsolute(XTensor * node);
/* gradient for sign */
static
void GradSign(XTensor * node);
/* gradient for clip */
static
void GradClip(XTensor * node);
/* gradient for round */
static
void GradRound(XTensor * node);
}; };
} }
......
...@@ -137,8 +137,6 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
        XTensor * x = income.tails[0];
        XNoder::MakeGrad(x);
        lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);

        root->visitMark = NODE_FINISHED;
    }
    /* we compute dE/dy (y is the output) if no predefined activation function is used */
......
...@@ -35,6 +35,8 @@ T2TAttention::T2TAttention()
    dk = -1;
    dv = -1;
    d  = -1;
    isMasked = false;
    ignored = 0;
}

/* deconstructor */
...@@ -46,13 +48,19 @@ T2TAttention::~T2TAttention()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the attention is with a mask
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, const char ** argv,
                             bool myIsMasked, int myIgnored,
                             int myDevID, XMem * myMem)
{
    devID = myDevID;
    mem = myMem;
    isMasked = myIsMasked;
    ignored = myIgnored;

    float minmax = 0;
...@@ -82,9 +90,10 @@ make the network
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
{
    XTensor k2;
    XTensor q2;
...@@ -105,10 +114,18 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
    vheads = Split(v2, v2.order - 1, nhead);

    XTensor att;
    XTensor dot;
    XTensor scalar;

    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

    if(isMasked)
        dot = dot + mask;

    scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);

    if(ignored > 0)
        _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);

    att = BMMul(scalar, vheads);

    /* concatenate the heads */
......
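In short, the masked attention computed above is softmax(Q * K^T / sqrt(dk) + mask) * V per head, where mask holds 0 for visible positions and a large negative value for masked ones, so the corresponding softmax weights become (numerically) zero. The _SetDataDim call then appears to overwrite the weights of the first `ignored` positions with a tiny constant, matching the "ignored positions" parameter added to InitModel.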
...@@ -66,6 +66,13 @@ public: ...@@ -66,6 +66,13 @@ public:
/* size of input Q, K and V */ /* size of input Q, K and V */
int d; int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
public: public:
/* constructor */ /* constructor */
T2TAttention(); T2TAttention();
...@@ -74,10 +81,12 @@ public: ...@@ -74,10 +81,12 @@ public:
~T2TAttention(); ~T2TAttention();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network */ /* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v); XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask);
}; };
} }
......
...@@ -136,7 +136,7 @@ XTensor T2TEmbedder::Make(XTensor &input)
    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;
}

}
...@@ -46,13 +46,18 @@ AttEncoder::~AttEncoder()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, const char ** argv,
                           bool myIsMasked, int myIgnored,
                           int myDevID, XMem * myMem)
{
    devID = myDevID;
    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
...@@ -72,7 +77,7 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
        fnns[i].InitModel(argc, argv, myDevID, myMem);
        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
...@@ -82,9 +87,11 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> skipInputRes - indicates whether we skip the residual connection of the first layer
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
{
    XTensor x;
...@@ -96,16 +103,27 @@ XTensor AttEncoder::Make(XTensor &input)
    XTensor fnn;
    XTensor res;

    if(skipInputRes && i == 0){
        /* self attention */
        att = attentions[i].Make(x, x, x, mask);

        /* TODO: dropout */

        /* layer normalization */
        x = attLayerNorms[i].Make(att);
    }
    else{
        /* self attention */
        att = attentions[i].Make(x, x, x, mask);

        /* residual connection */
        res = Sum(att, x);

        /* TODO: dropout */

        /* layer normalization */
        x = attLayerNorms[i].Make(res);
    }

    /* fnn */
    fnn = fnns[i].Make(x);
......
...@@ -40,7 +40,7 @@ class T2TEncoder ...@@ -40,7 +40,7 @@ class T2TEncoder
{ {
public: public:
virtual virtual
XTensor Make(XTensor &input) = 0; XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
}; };
/* /*
...@@ -49,7 +49,7 @@ the encoder based on RNN ...@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder class RNNEncoder : T2TEncoder
{ {
public: public:
XTensor Make(XTensor &input); XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
}; };
...@@ -77,6 +77,10 @@ public: ...@@ -77,6 +77,10 @@ public:
/* vocabulary size */ /* vocabulary size */
int vSize; int vSize;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* embedding of word at each position */ /* embedding of word at each position */
T2TEmbedder embedder; T2TEmbedder embedder;
...@@ -106,10 +110,12 @@ public: ...@@ -106,10 +110,12 @@ public:
~AttEncoder(); ~AttEncoder();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */ /* make the encoding network */
XTensor Make(XTensor &input); XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
}; };
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/ */
#include <math.h>
#include "T2TLayerNormal.h" #include "T2TLayerNormal.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
...@@ -89,14 +90,13 @@ XTensor T2TLN::Make(XTensor &input) ...@@ -89,14 +90,13 @@ XTensor T2TLN::Make(XTensor &input)
/* standard = sqrt(variance) */ /* standard = sqrt(variance) */
standard = Power(variance, 0.5F); standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into /* unsqueeze mean and standard deviation to fit them into
the same shape of x */ the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1)); meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1)); standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */ /* x' = (x - \mu)/standard */
xn = (x - meanFilled)/standardFilled ; xn = (x - meanFilled)/standardFilled;
/* result = x' * w + b */ /* result = x' * w + b */
return MMul(xn, w) + b; return MMul(xn, w) + b;
......
...@@ -34,6 +34,7 @@ T2TModel::T2TModel() ...@@ -34,6 +34,7 @@ T2TModel::T2TModel()
mem = NULL; mem = NULL;
isLM = false; isLM = false;
isMT = false; isMT = false;
nhead = 1;
} }
/* de-constructor */ /* de-constructor */
...@@ -55,24 +56,27 @@ void T2TModel::InitModel(int argc, const char ** argv) ...@@ -55,24 +56,27 @@ void T2TModel::InitModel(int argc, const char ** argv)
LoadParamBool(argc, argv, "mem", &useMem, useMem); LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamBool(argc, argv, "lm", &isLM, true); LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false); LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
if(useMem){ if(useMem){
delete mem; delete mem;
mem = new XMem(devID); mem = new XMem(devID);
} }
encoder.InitModel(argc, argv, devID, mem); encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem); outputLayer.InitModel(argc, argv, devID, mem);
} }
/* /*
make the encoding network make the encoding network
>> input - input tensor >> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> skipInputRes - indicates whether we skip the residual connection of the first layer
<< return - encoding result << return - encoding result
*/ */
XTensor T2TModel::MakeEncoding(XTensor &input) XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
{ {
return encoder.Make(input); return encoder.Make(input, mask, skipInputRes);
} }
/* /*
...@@ -85,8 +89,23 @@ void T2TModel::Make(XTensor &input, XTensor &output) ...@@ -85,8 +89,23 @@ void T2TModel::Make(XTensor &input, XTensor &output)
XTensor encoding; XTensor encoding;
if(isLM){ if(isLM){
encoding = MakeEncoding(input); /* generate mask to see "previous" words only */
int len = input.GetDim(input.order - 2);
int * dims = new int[input.order + 1];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9 */
_SetDataLowTri(&mask, 1e9F, -1);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
encoding = MakeEncoding(input, mask, true);
outputLayer.Make(encoding, output); outputLayer.Make(encoding, output);
delete[] dims;
} }
else{ else{
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
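To make the masking above concrete, here is a minimal standalone sketch (plain C++, not the library API; the values are simply derived from the two calls above) of the mask a length-3 sequence would receive: cells strictly below the diagonal end up as 0 and all remaining cells as -1e9, so a position attends only to strictly earlier positions.
#include <cstdio>
int main()
{
    const int len = 3;
    float mask[len][len];
    /* emulate _SetDataLowTri(&mask, 1e9F, -1) followed by _ScaleAndShiftMe(&mask, 1.0F, -1e9F) */
    for(int row = 0; row < len; row++){
        for(int col = 0; col < len; col++){
            float v = (col <= row - 1) ? 1e9F : 0.0F;   /* lower triangle (shift = -1) gets 1e9 */
            mask[row][col] = v * 1.0F - 1e9F;           /* scale by 1.0 and shift by -1e9 */
        }
    }
    for(int row = 0; row < len; row++){
        for(int col = 0; col < len; col++)
            printf("%10.1e", mask[row][col]);
        printf("\n");
    }
    return 0;
}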
...@@ -55,6 +55,9 @@ public: ...@@ -55,6 +55,9 @@ public:
/* indicates whether the model is running for machine translation */ /* indicates whether the model is running for machine translation */
bool isMT; bool isMT;
/* number of heads in the attention model */
int nhead;
public: public:
/* constructor */ /* constructor */
T2TModel(); T2TModel();
...@@ -66,7 +69,7 @@ public: ...@@ -66,7 +69,7 @@ public:
void InitModel(int argc, const char ** argv); void InitModel(int argc, const char ** argv);
/* make the encoding network */ /* make the encoding network */
XTensor MakeEncoding(XTensor &input); XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
/* make the entire network (with the output softmax layer) */ /* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output); void Make(XTensor &input, XTensor &output);
......
...@@ -100,7 +100,9 @@ void ShowParams(int argc, const char ** argv) ...@@ -100,7 +100,9 @@ void ShowParams(int argc, const char ** argv)
{ {
fprintf(stderr, "args:\n"); fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){ for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){ if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-') if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]); fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else else
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "../../XName.h" #include "../../XName.h"
#include "Div.h" #include "Div.h"
#include "Div.cuh" #include "Div.cuh"
#include "DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -137,6 +138,33 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -137,6 +138,33 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim); _Div(a, b, a, alpha, leadingDim);
} }
/*
return the dimension index if the division can be performed as DivDim (see DivDim.h for more details)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/* /*
element-wise division of two tensors (return a XTensor structure) element-wise division of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
...@@ -146,23 +174,41 @@ where i is the index of the item ...@@ -146,23 +174,41 @@ where i is the index of the item
>> a - tensor a >> a - tensor a
>> b - tensor b >> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the quotient of the tensors << return - the quotient of the tensors
*/ */
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim) XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{ {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
int n = GetDivDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, alpha, leadingDim);
/* call _Multiply function */ /* tensor connections */
_Div(&a, &b, &c, 0, leadingDim); XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
/* tensor connections */ XLink::AddParamToHeadInt(&c, leadingDim);
XLink::MakeLink(&a, &b, &c, MATH_DIV); }
XLink::AddParamToHeadInt(&c, leadingDim); else if(n >= 0 && n < a.order){
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c; return c;
} }
......
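As a usage illustration of the new dispatch (the shapes and tensor names below are hypothetical, and the core headers such as XTensor.h and Div.h are assumed to be included): when b has fewer dimensions than a and exactly one of its trailing dimensions matches, GetDivDimIndex reports that dimension and Div silently takes the broadcast path.
int aDims[2] = {2, 4};
int bDims[1] = {4};
XTensor * a = NewTensor(2, aDims);
XTensor * b = NewTensor(1, bDims);
/* GetDivDimIndex(*a, *b) returns 1, so the call below is routed to
   _DivDim(a, b, &c, 1, 0.0F), i.e., every row of a is divided element-wise by b */
XTensor c = Div(*a, *b);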
...@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it ...@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)/b(i) c(i) = a(i)/b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "../../XName.h" #include "../../XName.h"
#include "Multiply.h" #include "Multiply.h"
#include "Multiply.cuh" #include "Multiply.cuh"
#include "MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -138,6 +139,33 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -138,6 +139,33 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim); _Multiply(a, b, a, alpha, leadingDim);
} }
/*
return the dimension index if the multiplication can be performed as MultiplyDim (see MultiplyDim.h for more details)
>> a - a tensor
>> b - another tensor for multiplication
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/* /*
element-wise product of two tensors (return a XTensor structure) element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
...@@ -150,20 +178,38 @@ where i is the index of the item ...@@ -150,20 +178,38 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim) XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{ {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
/* call _Multiply function */ int n = GetMultiplyDimIndex(a, b);
_Multiply(&a, &b, &c, 0, leadingDim);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* tensor connections */ /* call _Multiply function */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); _Multiply(&a, &b, &c, 0, leadingDim);
XLink::AddParamToHeadInt(&c, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c; return c;
} }
......
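The same routing applies to element-wise multiplication; a parallel sketch (again with hypothetical shapes and the core headers assumed):
int aDims[2] = {2, 4};
int bDims[1] = {4};
XTensor * a = NewTensor(2, aDims);
XTensor * b = NewTensor(1, bDims);
/* GetMultiplyDimIndex(*a, *b) returns 1, so Multiply is routed to
   _MultiplyDim(a, b, &c, 1, 0.0F), broadcasting b over dimension 1 of a */
XTensor c = Multiply(*a, *b);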
...@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it ...@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)*b(i) c(i) = a(i)*b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -135,15 +135,14 @@ int GetSubDimIndex(const XTensor &a, const XTensor &b) ...@@ -135,15 +135,14 @@ int GetSubDimIndex(const XTensor &a, const XTensor &b)
if(a.order < b.order) if(a.order < b.order)
return -1; return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0; int hitCount = 0;
int hitDim = -1; int hitDim = -1;
for(int i = 0; i < a.order; i++){ for(int i = 0; i < b.order; i++){
if(a.dimSize[i] == b.unitNum){ if(b.dimSize[b.order - 1 - i] == 1)
hitDim = i; continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++; hitCount++;
hitDim = a.order - b.order + i;
} }
} }
...@@ -172,7 +171,6 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -172,7 +171,6 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
if(n == -1){ if(n == -1){
/* call _Sub function */ /* call _Sub function */
_Sub(&a, &b, &c, beta); _Sub(&a, &b, &c, beta);
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB); XLink::MakeLink(&a, &b, &c, MATH_SUB);
......
...@@ -146,8 +146,8 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE ...@@ -146,8 +146,8 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta); blockSize, stride, blockSize * stride, blockNum, beta);
else else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
......
...@@ -137,37 +137,17 @@ return a dimension if the sum is performed as SumDim (in more details in SumDim. ...@@ -137,37 +137,17 @@ return a dimension if the sum is performed as SumDim (in more details in SumDim.
*/ */
int GetSumDimIndex(const XTensor &a, const XTensor &b) int GetSumDimIndex(const XTensor &a, const XTensor &b)
{ {
//if(a.order < b.order)
// return -1;
//int hitCount = 0;
//int hitDim = -1;
//for(int i = 0; i < b.order; i++){
// if(b.dimSize[b.order - 1 - i] == 1)
// continue;
// else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
// hitCount++;
// hitDim = a.order - b.order + i;
// }
//}
//if(hitCount == 1)
// return hitDim;
//else
// return -1;
if(a.order < b.order) if(a.order < b.order)
return -1; return -1;
if(XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0; int hitCount = 0;
int hitDim = -1; int hitDim = -1;
for(int i = 0; i < a.order; i++){ for(int i = 0; i < b.order; i++){
if(a.dimSize[i] == b.unitNum){ if(b.dimSize[b.order - 1 - i] == 1)
hitDim = i; continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++; hitCount++;
hitDim = a.order - b.order + i;
} }
} }
......
...@@ -49,7 +49,7 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high) ...@@ -49,7 +49,7 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
for(int i = 0; i < a->order; i++){ for(int i = 0; i < a->order; i++){
if(i == dim){ if(i == dim){
CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!"); CheckNTErrors(low >= 0 && low < a->dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!"); CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
} }
else{ else{
...@@ -101,7 +101,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high) ...@@ -101,7 +101,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
for(int i = 0; i < a.order; i++){ for(int i = 0; i < a.order; i++){
if(i == dim){ if(i == dim){
CheckNTErrors(low > 0 && low < a.dimSize[dim], "Illegal range specified!"); CheckNTErrors(low >= 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!"); CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!");
dimSize[i] = high - low; dimSize[i] = high - low;
} }
......
...@@ -213,6 +213,106 @@ void _SetDataFixedDouble(XTensor * tensor, double p) ...@@ -213,6 +213,106 @@ void _SetDataFixedDouble(XTensor * tensor, double p)
_SetDataFixed(tensor, &p); _SetDataFixed(tensor, &p);
} }
/*
set data items along a given dimension (and keep the remaining items unchanged)
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> len - length along the given dimension
>> dim - the dimension along which we set the data
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
*/
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len <= tensor->GetDim(dim), "Illegal length!");
if(tensor->devID < 0){
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int l = len * stride;
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + blockSize * i + beg * stride;
for(int j = 0; j < l; j++)
d[j] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataDim(tensor, beg, len, dim, p);
#endif
}
}
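A minimal usage sketch matching the documented example (the tensor below is hypothetical; _SetDataRand is only used to give it some initial content):
int dims[2] = {3, 3};
XTensor * t = NewTensor(2, dims);
_SetDataRand(t, 0.0F, 1.0F);
/* beg = 1, len = 1, dim = 0, p = 0: set every entry of row 1 to 0 */
_SetDataDim(t, 1, 1, 0, 0.0F);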
/*
generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
if(tensor->devID < 0){
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + i * blockSize;
for(int row = 0; row < l; row++){
for(int col = 0; col <= row + shift; col++){
d[row * l + col] = p;
}
for(int col = MAX(0, row + shift + 1); col < l; col++){
d[row * l + col] = 0;
}
}
}
}
else{
#ifdef USE_CUDA
_CudaSetDataLowTri(tensor, p, shift);
#endif
}
}
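And a matching sketch for the second example above (p = 2, shift = -1), again on a hypothetical 3 * 3 tensor:
int dims[2] = {3, 3};
XTensor * tri = NewTensor(2, dims);
/* entries strictly below the diagonal become 2, all other entries become 0 */
_SetDataLowTri(tri, 2.0F, -1);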
/* /*
generate data items with a uniform distribution in [lower, upper] generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
......
...@@ -184,6 +184,169 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance) ...@@ -184,6 +184,169 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
} }
} }
/*
set data items along a given dimension (and keep the remaining items unchanged) - kernel version
>> d - pointer to the data array
>> beg - the beginning position
>> len - length of the segment to be set
>> blockSize - size of a data block
>> blockNum - number of data blocks
>> p - the value we use to set the data items
*/
__global__
void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block id */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockSize || j >= blockNum)
return;
if(i < beg || i >= beg + len)
return;
d[blockSize * j + i] = p;
}
/*
set data items along a given dimension (and keep the remaining items unchanged) - cuda version
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> len - length along the given dimension
>> dim - the dimension along which we set the data
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
*/
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim < n && dim >= 0, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len <= tensor->GetDim(dim), "Illegal length!");
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, len * stride, blockSize, blockNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set lower triangular matrices for each block
>> d - pointer to the data array
>> l - row number (or column number) of each block, i.e.,
a block is an l * l matrix
>> blockSize - size of each block (blockSize = l * l)
>> blockNum - number of the blocks
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
__global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block id */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockSize || j >= blockNum)
return;
int row = i / l;
int col = i % l;
DTYPE * d2 = d + blockSize * j + row * l + col;
if(col <= row + shift)
*d2 = p;
else
*d2 = 0;
}
/*
generate data as lower triangular matrices for the last two dimensions (cuda version)
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/* /*
generate data items with a uniform distribution in [lower, upper] generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
......
...@@ -37,6 +37,12 @@ void _CudaSetDataFixedFloat(XTensor * tensor, float p); ...@@ -37,6 +37,12 @@ void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */ /* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p); void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* set data items along a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* generate data as lower triangular matrices for the last two dimensions (cuda version) */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
......
...@@ -45,6 +45,12 @@ void _SetDataFixedFloat(XTensor * tensor, float p); ...@@ -45,6 +45,12 @@ void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */ /* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p); void _SetDataFixedDouble(XTensor * tensor, double p);
/* set data items along a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* generate data as lower triangular matrices for the last two dimensions */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */ /* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper); void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "TSubDim.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__