Commit fa4d308e by xuchen

Merge branch 'xuchen'

parents b8bcae5b 98db6f24
......@@ -2,7 +2,7 @@
## Notes
The latest CUDA version, 9.2, does not yet support the latest VS2017. It is therefore recommended to use CUDA 9.0 or 9.1 with VS2015, or to install the v140 toolset when using VS2017.
The latest CUDA version, 9.2, does not yet support the latest VS2017. It is therefore recommended to use CUDA 9.0 or 9.1 with VS2015, or to install the v140 toolset when using VS2017, and to set the solution platform to x64.
## CUDA Configuration
......@@ -29,7 +29,7 @@ The latest CUDA version, 9.2, does not yet support the latest VS2017. It is therefore recommended to use CUDA
Under **C/C++->Preprocessor->Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;DEBUG;CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
CONSOLE;
Set **Linker->System->SubSystem** to Console.
......
......@@ -24,6 +24,7 @@
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
......@@ -31,18 +32,24 @@
//#include <stdlib.h>
//#include <crtdbg.h>
void BackwardTest();
void TransposeTest();
void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv )
{
//_CrtSetBreakAlloc(896);
//BackwardTest();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......@@ -58,6 +65,41 @@ int main( int argc, const char ** argv )
return 0;
}
void BackwardTest()
{
XNet net;
XTensor a;
XTensor b;
XTensor c;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);
a.SetZeroAll();
b.SetZeroAll();
a.Set2D(1.0F, 0, 0);
a.Set2D(2.0F, 0, 1);
a.Set2D(3.0F, 0, 2);
a.Set2D(4.0F, 1, 0);
a.Set2D(5.0F, 1, 1);
a.Set2D(6.0F, 1, 2);
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
c = DivDim(a, b, 0);
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Dump(stderr);
}
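For reference, a hedged worked example of what BackwardTest computes (my reading of DivDim, not stated in the patch): with n = 0, each row of a is divided by the matching entry of b, so

\[ a = \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{pmatrix}, \quad b = (2, 1), \quad c = \mathrm{DivDim}(a, b, 0) = \begin{pmatrix} 0.5 & 1 & 1.5 \\ 4 & 5 & 6 \end{pmatrix} \]

net.Backward(c) then propagates gradients back through the DIVDIM node (see GradDivDim below).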
void TransposeTest()
{
#ifdef USE_CUDA
......
......@@ -50,6 +50,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
......@@ -58,6 +59,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else{
......
......@@ -22,7 +22,11 @@
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
#include "../tensor/function/LogSoftmax.h"
#include "../tensor/function/Rectify.h"
#include "../tensor/function/Sigmoid.h"
#include "../tensor/function/Softmax.h"
namespace nts{
......@@ -49,10 +53,22 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
......
......@@ -35,53 +35,61 @@ void XMathGrad::MakeGrad(XTensor * node)
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_SUM)
GradSum(node);
else if(operID == MATH_SUMDIM)
GradSumDim(node);
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
if(operID == MATH_ABSOLUTE)
GradAbsolute(node);
else if(operID == MATH_COS)
GradCos(node);
else if(operID == MATH_EXP)
GradExp(node);
else if(operID == MATH_LOG)
GradLog(node);
else if(operID == MATH_ROUND)
GradRound(node);
else if(operID == MATH_SIGN)
GradSign(node);
else if(operID == MATH_SIN)
GradSin(node);
else if(operID == MATH_TAN)
GradTan(node);
else if(operID == MATH_CLIP)
GradClip(node);
else if(operID == MATH_DIV)
GradDiv(node);
else if(operID == MATH_DIVDIM)
GradDivDim(node);
else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node);
else if(operID == MATH_MATRIXMULBATCHED)
GradMatrixMulBatched(node);
else if (operID == MATH_LOG)
GradLog(node);
else if (operID == MATH_POWER)
GradPower(node);
else if (operID == MATH_NEGATE)
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
else if(operID == MATH_MULTIPLYDIM)
GradMultiplyDim(node);
else if(operID == MATH_NEGATE)
GradNegate(node);
else if (operID == MATH_SCALEANDSHIFT)
else if(operID == MATH_NORMALIZE)
GradNormalize(node);
else if(operID == MATH_POWER)
GradPower(node);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node);
else if (operID == MATH_DIV)
GradDiv(node);
else if (operID == MATH_SUB)
else if(operID == MATH_SUB)
GradSub(node);
else if (operID == MATH_SIN)
GradSin(node);
else if (operID == MATH_COS)
GradCos(node);
else if (operID == MATH_TAN)
GradTan(node);
else if (operID == MATH_EXP)
GradExp(node);
else if (operID == MATH_NORMALIZE)
GradNormalize(node);
else if (operID == MATH_ABSOLUTE)
GradAbsolute(node);
else if (operID == MATH_SIGN)
GradSign(node);
else if (operID == MATH_ROUND)
GradRound(node);
else if (operID == MATH_CLIP)
GradClip(node);
else if (operID == REDUCE_REDUCEMEAN)
else if(operID == MATH_SUBDIM)
GradSubDim(node);
else if(operID == MATH_SUM)
GradSum(node);
else if(operID == MATH_SUMDIM)
GradSumDim(node);
else if(operID == REDUCE_REDUCEMEAN)
GradReduceMean(node);
else if (operID == REDUCE_REDUCESUM)
else if(operID == REDUCE_REDUCESUM)
GradReduceSum(node);
else if (operID == REDUCE_REDUCESUMSQUARED)
else if(operID == REDUCE_REDUCESUMSQUARED)
GradReduceSumSquared(node);
else if (operID == REDUCE_REDUCEVARIANCE)
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node);
else{
ShowNTErrors("TODO!");
......@@ -95,82 +103,319 @@ bool XMathGrad::IsMathOP(XTensor * node)
return (income.typeID & MATH_BASE) != 0;
}
/*
gradient for sum
for
c = a + b * \beta
/*
gradient for absolute
for
c = |a|
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
dE/da = dE/dc a >= 0
-dE/dc a < 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSum(XTensor * node)
void XMathGrad::GradAbsolute(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sign(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for cos
for
c = cos(a)
we have
dE/da = dE/dc * -sin(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradCos(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Sin(a, b);
_ScaleAndShiftMe(b, -1.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for exp
for
c = exp(a)
we have
dE/da = dE/dc * exp(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradExp(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for log
for
c = log(a)
we have
dE/da = dE/dc * 1/a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradLog(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for LOG!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
}
/*
gradient for round
for
c = round(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradRound(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
// we do nothing here
// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
}
/*
gradient for sign
for
c = sign(a)
we have
dE/da = 0
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSign(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
// we do nothing here
// TODO: set grad = 0 if the node is the only child
node->visitMark = NODE_FINISHED;
}
/*
gradient for sin
for
c = sin(a)
we have
dE/da = dE/dc * cos(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSin(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for tan
for
c = tan(a)
we have
dE/da = dE/dc * 1/(cos(a))^2
>> node - the node (c) for backward computation
*/
void XMathGrad::GradTan(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_PowerMe(b, -2.0F);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for clip
we have
dE/da = dE/dc    if lower < a < upper
dE/da = 0        otherwise
>> node - the node (c) for backward computation
*/
void XMathGrad::GradClip(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
XNoder::MakeGrad(a);
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
/*
gradient for divide
for
c = a / b
we have
dE/da = dE/dc / b
dE/db = dE/dc * a / -b^2
>> node - the node (c) for backward computation
*/
void XMathGrad::GradDiv(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVIDE!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Power(b, ab2, -2.0F);
_Multiply(a, ab2, ab2);
_ScaleAndShiftMe(ab2, -1.0F);
_Multiply(node->grad, ab2, b->grad, 1.0F);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
DelTensorBuf(ab2);
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum with one dimension
c = a + b * \beta
gradient for division with one dimension
c = a / b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
dE/da = dE/dc * (1/b)
dE/db = (dE/dc * (-a/b^2)).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradSumDim(XTensor * node)
void XMathGrad::GradDivDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
/* dE/da = dE/dc * (1/b) */
_DivDim(node->grad, b, a->grad, n, 1.0);
/* dE/db = dE/dc * dc/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * aTMP1 = NewTensorBuf(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBuf(a, a->devID, a->mem);
XTensor * bTMP = NewTensorBuf(b, b->devID, b->mem);
XTensor * interGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
_Power(b, bTMP, -2);
_MultiplyDim(aTMP1, bTMP, aTMP2, n);
_Multiply(node->grad, aTMP2, interGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
interGradTMP->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
_ReduceSum(interGradTMP, bGradTMP, 0);
_Sum(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ReduceSum(interGradTMP, b->grad, 0);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -187,57 +432,29 @@ void XMathGrad::GradSumDim(XTensor * node)
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
interGradTMP->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
_ReduceSum(interGradTMP, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
/*
gradient for multiply (dot production)
for
c = a * b
we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMultiply(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
DelTensorBuf(aTMP1);
DelTensorBuf(aTMP2);
DelTensorBuf(bTMP);
DelTensorBuf(interGradTMP);
node->visitMark = NODE_FINISHED;
}
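A hedged derivation of the DivDim gradients implemented above (my own working, consistent with the _Negate / _Power / _MultiplyDim / _ReduceSum sequence): for c = a / b with b broadcast along dimension n of a,

\[ \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \cdot \frac{1}{b}, \qquad \frac{\partial E}{\partial b_j} = \sum_{i:\, i_n = j} \frac{\partial E}{\partial c_i} \cdot \left( -\frac{a_i}{b_j^{2}} \right) \]

The two reshape paths only differ in whether the broadcast dimension is the last one (2D reshape) or an inner one (3D reshape followed by two reductions).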
......@@ -381,7 +598,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node)
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
......@@ -433,286 +649,145 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node)
node->visitMark = NODE_FINISHED;
}
/*
gradient for log
for
c = log(a)
we have
dE/da = dE/dc * 1/a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradLog(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for LOG!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
}
/*
gradient for power
for
c = pow(a,p)
we have
dE/da = (dE/dc) * p*a^(p-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradPower(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
_Power(a, b, (p-1)/p);
_ScaleAndShift(b, c, p);
_Multiply(node->grad, c, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
delete b;
delete c;
}
/*
gradient for negate
for
c = -a
we have
dE/da = dE/dc * (-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradNegate(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for ScaleAndShift
for
c = a * scale + shift
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
*/
void XMathGrad::GradScaleAndShift(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_ScaleAndShift(node->grad, b, scale);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for minus
/*
gradient for multiply (dot production)
for
c = a - b * \beta
c = a * b
we have
dE/da = dE/dc
dE/db = -dE/dc * \beta
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSub(XTensor * node)
void XMathGrad::GradMultiply(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, -beta);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
node->visitMark = NODE_FINISHED;
}
/*
gradient for divide
for
c = a / b
we have
dE/da = dE/dc / b
dE/db = dE/dc * a / -b^2
>> node - the node (c) for backward computation
gradient for multiply with one dimension
c = a * b
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0,...,n-1,n+1,...)
*/
void XMathGrad::GradDiv(XTensor * node)
void XMathGrad::GradMultiplyDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVIDE!");
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(b);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(b);
int n = income.GetParamInt(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
_Div(node->grad, b, a->grad, 1.0F);
_Power(b, c, -2.0F);
_Multiply(a, c, d);
_ScaleAndShift(d, e, -1.0F);
_Multiply(node->grad, e, b->grad, 1.0F);
node->visitMark = NODE_FINISHED;
delete c;
delete d;
delete e;
}
/*
gradient for exp
for
c = exp(a)
we have
dE/da = dE/dc * exp(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradExp(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XNoder::MakeGrad(a);
_Exp(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
node->visitMark = NODE_FINISHED;
XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
delete b;
}
/* we reshape dE/dc * a to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
/*
gradient for sin
for
c = sin(a)
we have
dE/da = dE/dc * cos(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSin(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
XNoder::MakeGrad(a);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(bGradTMP, b->grad, 0);
}
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
_Cos(a, b);
_Multiply(node->grad, b, a->grad, 1.0F);
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
node->visitMark = NODE_FINISHED;
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
delete b;
}
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
/*
gradient for cos
for
c = cos(a)
we have
dE/da = dE/dc * -sin(a)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradCos(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP2 = NewTensorBuf(b->grad, b->devID, b->mem);
XNoder::MakeGrad(a);
_ReduceSum(interGrad, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
_Sin(a, b);
_ScaleAndShift(b, c, -1.0F);
_Multiply(node->grad, c, a->grad, 1.0F);
DelTensorBuf(bGradTMP2);
}
else{
_ReduceSum(interGrad, b->grad, 0);
}
node->visitMark = NODE_FINISHED;
DelTensorBuf(interGrad);
}
delete b;
delete c;
DelTensor(bGradTMP);
node->visitMark = NODE_FINISHED;
}
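Similarly, a hedged sketch of the MultiplyDim gradients used above: for c = a * b with b broadcast along dimension n,

\[ \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c} \cdot b, \qquad \frac{\partial E}{\partial b_j} = \sum_{i:\, i_n = j} \frac{\partial E}{\partial c_i} \cdot a_i \]

which is what the _MultiplyDim call (for dE/da) and the _Multiply + reshape + _ReduceSum path (for dE/db) compute.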
/*
gradient for tan
gradient for negate
for
c = tan(a)
c = -a
we have
dE/da = dE/dc * 1/(cos(a))^2
dE/da = dE/dc * (-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradTan(XTensor * node)
void XMathGrad::GradNegate(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
XNoder::MakeGrad(a);
_Cos(a, b);
_Power(b, c, -2.0F);
_Multiply(node->grad, c, a->grad, 1.0F);
_ScaleAndShift(node->grad, b, -1.0F);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
delete c;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -804,110 +879,300 @@ void XMathGrad::GradNormalize(XTensor * node)
}
/*
gradient for absolute
gradient for power
for
c = |a|
c = pow(a,p)
we have
dE/da = dE/dc a >= 0
-dE/dc a < 0
dE/da = (dE/dc) * p * a^(p-1)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradAbsolute(XTensor * node)
void XMathGrad::GradPower(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
DTYPE p = income.GetParam(0);
XNoder::MakeGrad(a);
_Sign(a, b);
_Power(a, b, p - 1.0F);
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
DelTensor(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
gradient for sign
gradient for ScaleAndShift
for
c = sign(a)
c = a * scale + shift
we have
dE/da = 0
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSign(XTensor * node)
void XMathGrad::GradScaleAndShift(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
b->SetZeroAll();
_Sum(a->grad, b, a->grad);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for round
gradient for minus
for
c = round(a)
c = a - b * \beta
we have
dE/da = 0
dE/da = dE/dc
dE/db = -dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradRound(XTensor * node)
void XMathGrad::GradSub(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
b->SetZeroAll();
_Sum(a->grad, b, a->grad);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, -beta);
node->visitMark = NODE_FINISHED;
delete b;
}
/*
gradient for clip
gradient for subtraction with one dimension
c = a - b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = - dE/dc.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSubDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum
for
c = a + b * \beta
we have
dE/da = 1 lower < a < upper
dE/da = 0 otherwise
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradClip(XTensor * node)
void XMathGrad::GradSum(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
DTYPE lower = income.GetParam(0);
DTYPE upper = income.GetParam(1);
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_ClipBackward(node, a, node->grad, a->grad, lower, upper);
_Sum(a->grad, b, a->grad);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
node->visitMark = NODE_FINISHED;
}
/*
gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSumDim(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
}
node->grad->Reshape(order, dimSize);
delete b;
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
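The two broadcast variants above (GradSubDim and GradSumDim) follow the same reduction pattern; a hedged summary for c = a ± b * \beta with b broadcast along dimension n:

\[ \frac{\partial E}{\partial a} = \frac{\partial E}{\partial c}, \qquad \frac{\partial E}{\partial b_j} = \pm\,\beta \sum_{i:\, i_n = j} \frac{\partial E}{\partial c_i} \]

hence the shared reshape + _ReduceSum code, with only the sign (the _Sub vs. _Sum accumulation) differing.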
/*
......@@ -924,21 +1189,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * c = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_ScaleAndShift(b, c, 1.0F/n);
_Sum(a->grad, c, a->grad);
_ScaleAndShiftMe(b, 1.0F/n);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensorBuf(b);
delete b;
delete c;
node->visitMark = NODE_FINISHED;
}
/*
......@@ -955,27 +1219,28 @@ void XMathGrad::GradReduceSum(XTensor * node)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
XTensor * a = income.tails[0];
XTensor * b = NewTensor(a);
XTensor * b = NewTensorBuf(a, a->devID, a->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
node->visitMark = NODE_FINISHED;
DelTensor(b);
delete b;
node->visitMark = NODE_FINISHED;
}
/*
gradient for reduceSumSquared
for
c = reduceSumSquared(a, dim, b)
c = \sum_i (a_i - b)^2
we have
dE/da = Unsqueeze(dE/dc) * 2 * (a - Unsqueeze(b))
dE/db = Unsqueeze(dE/dc) * (-2b)
dE/db = dE/dc * (-2) * \sum_i (a_i - b)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceSumSquared(XTensor * node)
......@@ -985,35 +1250,46 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(a);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(c);
XTensor * c = NewTensorBuf(a, a->devID, a->mem);
XTensor * d = NewTensorBuf(a, a->devID, a->mem);
XTensor * e = NewTensorBuf(a, a->devID, a->mem);
XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_ScaleAndShift(a, c, 2.0F);
_ScaleAndShift(b, d, -2.0F);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2(a-b) */
_ScaleAndShiftMe(d, 2.0F);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
_Multiply(d, e, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
/* dE/db = dE/dc * -2 * (a-b*n) */
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
delete c;
delete d;
delete e;
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
node->visitMark = NODE_FINISHED;
}
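A hedged check of the reduceSumSquared gradients above: with c = \sum_i (a_i - b)^2,

\[ \frac{\partial E}{\partial a_i} = \frac{\partial E}{\partial c} \cdot 2\,(a_i - b), \qquad \frac{\partial E}{\partial b} = \frac{\partial E}{\partial c} \cdot (-2) \sum_i (a_i - b) \]

which matches the code: d holds a - Unsqueeze(b) scaled by 2, and f holds the reduced difference scaled by -2.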
/*
gradient for reduceVariance
for
c = reduceVariance(a, dim, b)
c = (sum_i (a_i - b)^2) * 1/n
where b is the mean, and n is the size of a
we have
dE/da = Unsqueeze(dE/dc) * 2a/dimSizeA[dim]
dE/db = Unsqueeze(dE/dc) * (-2a/dimSizeA[dim])
dE/da = Unsqueeze(dE/dc) * 2 * (a - Unsqueeze(b)) / n
dE/db = dE/dc * (-2/n) * \sum_i (a_i - b)
>> node - the node (c) for backward computation
*/
void XMathGrad::GradReduceVariance(XTensor * node)
......@@ -1023,26 +1299,36 @@ void XMathGrad::GradReduceVariance(XTensor * node)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XTensor * c = NewTensor(a);
XTensor * d = NewTensor(b);
XTensor * e = NewTensor(a);
XTensor * c = NewTensorBuf(a, a->devID, a->mem);
XTensor * d = NewTensorBuf(a, a->devID, a->mem);
XTensor * e = NewTensorBuf(a, a->devID, a->mem);
XTensor * f = NewTensorBuf(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_ScaleAndShift(a, c, 2.0F / n);
_ScaleAndShift(b, d, -2.0F / n);
/* compute a-b */
_Unsqueeze(b, c, dim, n);
_Sub(a, c, d);
_ReduceSum(d, f, dim);
/* dE/da = Unsqueeze(dE/dc) * 2 (a-b) / n */
_ScaleAndShiftMe(d, 2.0F / n);
_Unsqueeze(node->grad, e, dim, n);
_Multiply(e, c, a->grad, 1.0F);
_Multiply(node->grad, d, b->grad, 1.0F);
_Multiply(d, e, a->grad, 1.0F);
node->visitMark = NODE_FINISHED;
/* dE/db = dE/dc * -2 * (a-b) */
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
delete c;
delete d;
delete e;
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
node->visitMark = NODE_FINISHED;
}
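The reduceVariance case is the same derivation scaled by 1/n: with c = \frac{1}{n} \sum_i (a_i - b)^2,

\[ \frac{\partial E}{\partial a_i} = \frac{\partial E}{\partial c} \cdot \frac{2\,(a_i - b)}{n}, \qquad \frac{\partial E}{\partial b} = \frac{\partial E}{\partial c} \cdot \left( -\frac{2}{n} \right) \sum_i (a_i - b) \]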
}
......@@ -40,18 +40,50 @@ public:
bool IsMathOP(XTensor * node);
private:
/* gradient for sum: c = a + b * \beta */
/* gradient for absolute */
static
void GradSum(XTensor * node);
void GradAbsolute(XTensor * node);
/* gradient for cos */
static
void GradCos(XTensor * node);
/* gradient for exp */
static
void GradExp(XTensor * node);
/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
/* gradient for log: c = log(a) */
static
void GradSumDim(XTensor * node);
void GradLog(XTensor * node);
/* gradient for round */
static
void GradRound(XTensor * node);
/* gradient for sign */
static
void GradSign(XTensor * node);
/* gradient for multiply (dot production): c = a * b * \alpha */
/* gradient for sin */
static
void GradMultiply(XTensor * node);
void GradSin(XTensor * node);
/* gradient for tan */
static
void GradTan(XTensor * node);
/* gradient for clip */
static
void GradClip(XTensor * node);
/* gradient for Divide */
static
void GradDiv(XTensor * node);
/* gradient for DivideDim */
static
void GradDivDim(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) * \alpha */
static
......@@ -68,17 +100,26 @@ private:
static
void GradMatrixMulBatched(XTensor * node);
/* gradient for log: c = log(a) */
/* gradient for multiply (dot production): c = a * b * \alpha */
static
void GradLog(XTensor * node);
void GradMultiply(XTensor * node);
/* gradient for power */
/* gradient for multiply one dimension: c = a * b * \alpha
where the size of b is equal to that of one dimension of a */
static
void GradPower(XTensor * node);
void GradMultiplyDim(XTensor * node);
/* gradient for negate */
static
void GradNegate(XTensor * node);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
/* gradient for power */
static
void GradPower(XTensor * node);
/* gradient for ScaleAndShift */
static
......@@ -87,10 +128,20 @@ private:
/* gradient for Minus */
static
void GradSub(XTensor * node);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node);
/* gradient for Divide */
/* gradient for sum: c = a + b * \beta */
static
void GradDiv(XTensor * node);
void GradSum(XTensor * node);
/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSumDim(XTensor * node);
/* gradient for reduceMean */
static
......@@ -107,42 +158,6 @@ private:
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node);
/* gradient for sin */
static
void GradSin(XTensor * node);
/* gradient for cos */
static
void GradCos(XTensor * node);
/* gradient for tan */
static
void GradTan(XTensor * node);
/* gradient for exp */
static
void GradExp(XTensor * node);
/* gradient for normalize */
static
void GradNormalize(XTensor * node);
/* gradient for absolute */
static
void GradAbsolute(XTensor * node);
/* gradient for sign */
static
void GradSign(XTensor * node);
/* gradient for clip */
static
void GradClip(XTensor * node);
/* gradient for round */
static
void GradRound(XTensor * node);
};
}
......
......@@ -99,7 +99,7 @@ arguments:
(how many words)
-shuffle: shuffle the training data
-devid D: the id of the device used
-1: GPU, >=0: GPUs
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
......
......@@ -35,6 +35,8 @@ T2TAttention::T2TAttention()
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* destructor */
......@@ -46,29 +48,39 @@ T2TAttention::~T2TAttention()
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void T2TAttention::InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.08F);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
wk.SetDataRand(-minmax, minmax);
wq.SetDataRand(-minmax, minmax);
wv.SetDataRand(-minmax, minmax);
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
wv.SetDataRand(-finfoutv, finfoutv);
}
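The finfout* bounds above follow the Xavier (Glorot) uniform initialization; for reference (the factor s is the tunable "scale" variable, 1.0 in this patch):

\[ W \sim \mathcal{U}\!\left( -\sqrt{\frac{6\,s}{n_{in} + n_{out}}},\; +\sqrt{\frac{6\,s}{n_{in} + n_{out}}} \right) \]

with n_in = d and n_out = dk for wk and wq, or dv for wv. The same pattern is reused below for the FNN, layer-normalization and output weights.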
/*
......@@ -78,9 +90,10 @@ make the network
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
{
XTensor k2;
XTensor q2;
......@@ -101,10 +114,18 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
vheads = Split(v2, v2.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
if(ignored > 0)
_SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
att = BMMul(scalar, vheads);
/* concatenate the heads */
......
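For reference, a hedged statement of what Make now computes per head, essentially the standard masked scaled dot-product attention (the exact broadcasting is my reading of the code):

\[ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left( \frac{Q K^{\top} + M}{\sqrt{d_k}} \right) V \]

where M is the additive mask (0 for visible positions, about -1e9 for blocked ones); when ignored > 0, the attention weights of the first "ignored" query positions are presumably overwritten with the small constant 1e-9 by _SetDataDim.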
......@@ -66,6 +66,13 @@ public:
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
public:
/* constructor */
T2TAttention();
......@@ -74,10 +81,12 @@ public:
~T2TAttention();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v);
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask);
};
}
......
......@@ -53,16 +53,14 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 256);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
w.SetDataRandn(0, sqrt((float)eSize));
w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
......@@ -84,11 +82,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
dp[k] = sin(pos/pow(10000.0F, 2.0F*i/d));
dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
}
else{
int i = (k - 1)/2;
dp[k] = cos(pos/pow(10000.0F, 2.0F*i/d));
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
}
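The loop above implements the standard sinusoidal positional encoding; written out, it matches the sin/cos branches directly:

\[ PE(pos, 2i) = \sin\!\left( \frac{pos}{10000^{2i/d}} \right), \qquad PE(pos, 2i+1) = \cos\!\left( \frac{pos}{10000^{2i/d}} \right) \]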
......@@ -135,7 +133,7 @@ XTensor T2TEmbedder::Make(XTensor &input)
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = MMul(&input, w);
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
......
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_BEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
......@@ -53,6 +53,9 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* word embedding matrix */
XTensor w;
......
......@@ -38,28 +38,33 @@ AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] fnns;
delete[] layerNorms;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
void AttEncoder::InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nstack", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, 512);
LoadParamInt(argc, argv, "esize", &eSize, 512);
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
CheckNTErrors(nlayer > 1, "We have one encoding layer at least!");
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
......@@ -67,22 +72,26 @@ void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myM
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
layerNorms = new T2TLN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
layerNorms[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> skipInputRes - indicates whether we skip the residual connection of the first layer
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input)
XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
{
XTensor x;
......@@ -94,19 +103,27 @@ XTensor AttEncoder::Make(XTensor &input)
XTensor fnn;
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x);
if(skipInputRes && i == 0){
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* residual connection */
res = Sum(att, x);
/* TODO: dropout */
/* TODO: dropout */
/* layer normalization */
x = attLayerNorms[i].Make(att);
}
else{
/* self attention */
att = attentions[i].Make(x, x, x, mask);
/* layer normalization */
ln = layerNorms[i].Make(res);
/* residual connection */
res = Sum(att, x);
/* TODO: dropout */
/* input of next layer */
x = ln;
/* layer normalization */
x = attLayerNorms[i].Make(res);
}
/* fnn */
fnn = fnns[i].Make(x);
......@@ -117,10 +134,7 @@ XTensor AttEncoder::Make(XTensor &input)
/* TODO: dropout */
/* layer normalization */
ln = layerNorms[i].Make(res);
/* input of next layer */
x = ln;
x = fnnLayerNorms[i].Make(res);
}
return x;
......
......@@ -40,7 +40,7 @@ class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input) = 0;
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
};
/*
......@@ -49,7 +49,7 @@ the encoder based on RNN
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input);
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
};
......@@ -77,6 +77,10 @@ public:
/* vocabulary size */
int vSize;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
......@@ -86,8 +90,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization */
T2TLN * layerNorms;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the encoder */
XTensor * input;
......@@ -103,10 +110,12 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, const char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
/* make the encoding network */
XTensor Make(XTensor &input);
XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
};
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -55,10 +56,10 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.08F);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
......@@ -66,10 +67,14 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
w1.SetDataRand(-minmax, minmax);
b1.SetDataRand(-minmax, minmax);
w2.SetDataRand(-minmax, minmax);
b2.SetDataRand(-minmax, minmax);
float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll();
w2.SetDataRand(-finfout2, finfout2);
b2.SetZeroAll();
}
/*
......@@ -83,10 +88,10 @@ XTensor T2TFNN::Make(XTensor &input)
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, X_NOTRANS, w1, X_NOTRANS) + b1);
t1 = Rectify(MMul(input, w1) + b1);
/* result = t1 * w2 + b2 */
return MMul(t1, X_NOTRANS, w2, X_NOTRANS) + b2;
return MMul(t1, w2) + b2;
}
......
......@@ -19,7 +19,10 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -48,6 +51,18 @@ void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
int d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale / (d + d));
w.SetDataRand(-finfout, finfout);
b.SetZeroAll();
}
/*
......@@ -60,6 +75,7 @@ y =
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
......@@ -67,21 +83,23 @@ XTensor T2TLN::Make(XTensor &input)
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceSum(x, x.order - 1);
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same size of x */
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
return (x - meanFilled)/standardFilled;
xn = (x - meanFilled)/standardFilled;
/* result = x' * w + b */
return MMul(xn, w) + b;
}
}
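In equation form, the layer normalization implemented above is (note that in this patch w is a full d x d matrix rather than the more common per-element gain):

\[ y = \frac{x - \mu}{\sigma}\, W + b, \qquad \mu = \frac{1}{d} \sum_i x_i, \quad \sigma = \sqrt{\frac{1}{d} \sum_i (x_i - \mu)^2} \]

with the statistics computed over the last dimension of x.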
......@@ -29,6 +29,8 @@ using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
......@@ -37,6 +39,12 @@ public:
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
public:
/* constructor */
......
......@@ -22,6 +22,7 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
......@@ -33,6 +34,7 @@ T2TModel::T2TModel()
mem = NULL;
isLM = false;
isMT = false;
nhead = 1;
}
/* destructor */
......@@ -54,24 +56,27 @@ void T2TModel::InitModel(int argc, const char ** argv)
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
if(useMem){
delete mem;
mem = new XMem(devID);
}
encoder.InitModel(argc, argv, devID, mem);
encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
}
/*
make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> skipInputRes - indicates whether we skip the residual connection of the first layer
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input)
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
{
return encoder.Make(input);
return encoder.Make(input, mask, skipInputRes);
}
/*
......@@ -81,15 +86,30 @@ make the entire network (with the output softmax layer)
*/
void T2TModel::Make(XTensor &input, XTensor &output)
{
XTensor encoding;
if(isLM){
XTensor encoding;
/* generate mask to see "previous" words only */
int len = input.GetDim(input.order - 2);
int * dims = new int[input.order + 1];
for(int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9 */
_SetDataLowTri(&mask, 1e9F, -1);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
encoding = MakeEncoding(input);
encoding = MakeEncoding(input, mask, true);
outputLayer.Make(encoding, output);
delete[] dims;
}
else{
ShowNTErrors("TODO!");
}
}
}
\ No newline at end of file
}
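A hedged illustration of the causal mask built above (assuming the usual language-model convention that position i may only attend to positions <= i): for a length-3 sequence, each head's mask would be

\[ M = \begin{pmatrix} 0 & -10^{9} & -10^{9} \\ 0 & 0 & -10^{9} \\ 0 & 0 & 0 \end{pmatrix} \]

which, added to QK^T before the softmax, effectively removes the attention paid to future words.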
......@@ -55,6 +55,9 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* number of heads in the attention model */
int nhead;
public:
/* constructor */
T2TModel();
......@@ -66,7 +69,7 @@ public:
void InitModel(int argc, const char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input);
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output);
......
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
......@@ -56,12 +57,15 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_BEDDING_SIZE);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
w.SetDataRand(-minmax, minmax);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
}
/*
......@@ -89,4 +93,4 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
output = LogSoftmax(MMul(x, w), -1);
}
}
\ No newline at end of file
}
......@@ -59,6 +59,8 @@ void T2TTrainer::Init(int argc, const char ** argv)
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
LoadParamInt(argc, argv, "nstep", &nstep, 1);
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
......@@ -82,6 +84,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
int wordCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
XNet net;
......@@ -108,8 +111,12 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
/* back-propagation for obtaining gradients */
net.Backward(output, batch, CROSSENTROPY);
/* learning rate */
lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
//lr = 0.00005F;
/* update the parameters */
Update(model);
Update(model, lr);
/* get probabilities */
float prob = GetProb(&output, &batch, NULL);
......@@ -125,18 +132,21 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
if (step % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
}
}
fclose(file);
if (isEnd)
break;
}
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
elapsed, step, epoch);
}
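As a quick sanity check on the schedule above, the sketch below evaluates lr = d^-0.5 * min((step+1)^-0.5, (step+1) * nwarmup^-1.5) with the defaults d = 512 and nwarmup = 4000 loaded in T2TTrainer::Init; the rate ramps up linearly during warm-up and then decays with the inverse square root of the step.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int d = 512;          /* default "d" in T2TTrainer::Init */
    const int nwarmup = 4000;   /* default "nwarmup" in T2TTrainer::Init */

    for (int step : {0, 100, 1000, 3999, 4000, 40000}) {
        double lr = (1.0 / std::sqrt((double)d)) *
                    std::min(std::pow(step + 1.0, -0.5),
                             (step + 1.0) * std::pow((double)nwarmup, -1.5));
        printf("step=%-6d lr=%.3e\n", step, lr);
    }
    return 0;
}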
......@@ -317,10 +327,14 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
}
/*
update the model by delta rule
update the model by delta rule
\theta_new = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
>> model - the t2t model
>> lr - learning rate
*/
void T2TTrainer::Update(T2TModel * model)
void T2TTrainer::Update(T2TModel * model, const float lr)
{
XList ws(100);
......@@ -331,6 +345,13 @@ void T2TTrainer::Update(T2TModel * model)
ws.Add(&model->encoder.fnns[i].b1);
ws.Add(&model->encoder.fnns[i].w2);
ws.Add(&model->encoder.fnns[i].b2);
ws.Add(&model->encoder.attentions[i].wk);
ws.Add(&model->encoder.attentions[i].wq);
ws.Add(&model->encoder.attentions[i].wv);
ws.Add(&model->encoder.fnnLayerNorms[i].w);
ws.Add(&model->encoder.fnnLayerNorms[i].b);
ws.Add(&model->encoder.attLayerNorms[i].w);
ws.Add(&model->encoder.attLayerNorms[i].b);
}
ws.Add(&model->encoder.embedder.w);
......@@ -339,11 +360,37 @@ void T2TTrainer::Update(T2TModel * model)
XTensor * para = (XTensor*)ws.Get(i);
XTensor * paraGrad = para->grad;
if (para == NULL || paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
/*
DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
for (int i = 0; i < para->unitNum; i++) {
if (IsNAN(d[i]) || IsINF(d[i])) {
int nnn = 0;
}
if (IsNAN(g[i]) || IsINF(g[i])) {
int nnn = 0;
}
}
delete[] d;
delete[] g;
*/
/* the delta rule */
_Sum(para, paraGrad, para, -lrate);
_Sum(para, paraGrad, para, -lr);
/* clear gradient */
paraGrad->SetZeroAll();
}
}
......
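The update itself is the plain delta rule: _Sum(para, paraGrad, para, -lr) computes para = para + (-lr) * paraGrad in place, and the gradient buffer is then zeroed. A minimal sketch of the same arithmetic on a flat array:

#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> theta = { 0.50F, -1.20F, 3.00F };   /* parameters  */
    std::vector<float> grad  = { 0.10F,  0.40F, -0.20F };  /* their grads */
    const float lr = 0.001F;

    for (size_t i = 0; i < theta.size(); i++) {
        theta[i] += -lr * grad[i];   /* theta_new = theta - lr * grad  */
        grad[i] = 0.0F;              /* mirrors paraGrad->SetZeroAll() */
    }

    for (float t : theta)
        printf("%.5f\n", t);
    return 0;
}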
......@@ -63,6 +63,12 @@ public:
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */
int vSize;
......@@ -105,7 +111,7 @@ public:
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model);
void Update(T2TModel * model, const float lr);
};
......
......@@ -26,6 +26,8 @@
namespace transformer
{
FILE * tmpFILE;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
......@@ -98,7 +100,9 @@ void ShowParams(int argc, const char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
......
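The revised test above only treats an argument as an option name if it starts with '-' and its second character is not a digit in 1-9, so negative numeric values such as "-1e9" are no longer printed as option names. A standalone sketch of the same check (the helper name is made up for illustration):

#include <cstdio>

/* hypothetical helper mirroring the condition in ShowParams */
static bool IsOptionName(const char * arg)
{
    if (arg[0] == 0 || arg[1] == 0)   /* "", "-" and other single characters are skipped */
        return false;
    return arg[0] == '-' && (arg[1] < '1' || arg[1] > '9');
}

int main()
{
    const char * args[] = { "-lr", "-1e9", "-nhead", "8" };
    for (const char * a : args)
        printf("%-6s -> %s\n", a, IsOptionName(a) ? "option name" : "value");
    return 0;
}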
......@@ -27,6 +27,8 @@
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
......
......@@ -33,6 +33,8 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
......@@ -51,6 +53,8 @@ int TransformerMain(int argc, const char ** argv)
delete[] trainFN;
fclose(tmpFILE);
return 0;
}
......
......@@ -45,7 +45,7 @@ int main( int argc, const char ** argv )
//_CrtSetBreakAlloc(123);
/* a tiny test */
SmallTest();
//SmallTest();
//_CrtDumpMemoryLeaks();
//return 0;
......
......@@ -43,7 +43,7 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#define _XINLINE_ inline
#define _XINLINE_
//#define DOUBELPRICSION
......
......@@ -45,12 +45,16 @@ const char * GetOPName(int type)
return "M_CLIP";
else if (type == MATH_DIV)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_NORMALIZE)
......@@ -61,10 +65,12 @@ const char * GetOPName(int type)
return "M_SCALEANDSHIFT";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUB)
return "M_SUB";
else if (type == MATH_SUBDIM)
return "M_SUBDIM";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_SUMDIM)
return "M_SUMDIM";
else if (type == REDUCE_REDUCEMAX)
......
......@@ -41,17 +41,20 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_MATRIXMUL MATH_DIV + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_SUB MATH_SUM + 1
#define MATH_SUMDIM MATH_SUB + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1
#define REDUCE MATH_SUMDIM + 1
#define REDUCE_REDUCEMAX REDUCE + 1
......
......@@ -1046,7 +1046,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d1};
int dims[3] = {d0, d1, d2};
return SetToDevice(devID, GetCell(dims, 3), value);
}
......
......@@ -27,15 +27,18 @@
#include "../XTensor.h"
#include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
#include "arithmetic/MatrixMul2DParallel.h"
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
......
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -137,6 +138,33 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim);
}
/*
return a dimension if the division is performed as DivDim (see DivDim.h for more details)
>> a - a tensor
>> b - another tensor for division
*/
int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
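To make the matching rule concrete, the sketch below re-implements the same loop over plain shape vectors (illustration only, not library code): shapes are compared from the trailing dimension, size-1 dimensions of b are skipped, and an index is returned only if exactly one dimension of b matches. The same rule is reused verbatim by GetMultiplyDimIndex and GetSubDimIndex later in this commit.

#include <cstdio>
#include <vector>

/* illustration-only re-implementation of the matching loop above */
int MatchBroadcastDim(const std::vector<int> & a, const std::vector<int> & b)
{
    if (a.size() < b.size())
        return -1;

    int hitCount = 0;
    int hitDim = -1;
    for (size_t i = 0; i < b.size(); i++) {
        if (b[b.size() - 1 - i] == 1)
            continue;
        if (b[b.size() - 1 - i] == a[a.size() - 1 - i]) {
            hitCount++;
            hitDim = (int)(a.size() - b.size() + i);
        }
    }
    return hitCount == 1 ? hitDim : -1;
}

int main()
{
    printf("%d\n", MatchBroadcastDim({2, 3, 4}, {4}));    /* 2: b matches the last dimension of a */
    printf("%d\n", MatchBroadcastDim({2, 3, 4}, {5}));    /* -1: nothing matches                  */
    printf("%d\n", MatchBroadcastDim({2, 3, 4}, {3, 4})); /* -1: more than one dimension matches  */
    return 0;
}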
/*
element-wise division of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
......@@ -146,23 +174,41 @@ where i is the index of the item
>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
int n = GetDivDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, alpha, leadingDim);
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHeadInt(&c, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
......
......@@ -31,7 +31,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (do it on site)
......@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return a XTensor structure)
......@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)/b(i)
where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put result. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaDivDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++){
if(alpha == 0.0F)
cp[k] = ap[k] / bv;
else
cp[k] = ap[k] / bv + alpha * cp[k];
}
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] / bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
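The stride/blockSize/blockNum bookkeeping above (shared by _MultiplyDim, _SubDim and the CUDA kernels in this commit) splits the tensor into blockNum blocks of blockSize * stride elements around dimension n. A tiny sketch for a hypothetical {2, 3, 4} tensor broadcast along n = 1:

#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> dimSize = { 2, 3, 4 };   /* hypothetical shape  */
    const int n = 1;                                /* broadcast dimension */

    int stride = 1;
    int blockNum = 1;
    for (int i = (int)dimSize.size() - 1; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];       /* elements after dimension n   */
        else if (i < n)
            blockNum *= dimSize[i];     /* blocks before dimension n    */
    }
    int blockSize = dimSize[n];         /* the broadcast dimension size */

    /* prints: stride=4 blockSize=3 blockNum=2 */
    printf("stride=%d blockSize=%d blockNum=%d\n", stride, blockSize, blockNum);
    return 0;
}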
/*
tensor division of two tensors (do it on site)
keep the result in the input tensor and return nothing
a = a/b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_DivDim(a, b, a, n, alpha);
}
/*
tensor division of two tensors (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor division
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include "DivDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division of a tensor and a row vector
c = a / b + alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if(col >= colNum || row >= rowNum)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.x];
}
/*
tensor division of a tensor and a column vector
c = a / b + alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelDivWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if(row >= rowNum || block >= blockNum)
return;
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if(alphaFired)
c[offset] = a[offset] / bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] / bv[threadIdx.y];
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a / b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in division!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in division!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelDivWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if(stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelDivWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelDivWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_CUH__
#define __DIVDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting (cuda version)
*/
void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#ifndef __DIVDIM_H__
#define __DIVDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
a = a / b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we keep the result in the input tensor a and return nothing
*/
void _DivDim(XTensor * a, const XTensor * b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
......@@ -23,6 +23,7 @@
#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -138,6 +139,33 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim);
}
/*
return a dimension if the multiplication is performed as MultiplyDim (see MultiplyDim.h for more details)
>> a - a tensor
>> b - another tensor for multiplication
*/
int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/*
element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
......@@ -150,20 +178,38 @@ where i is the index of the item
>> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
{
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
XTensor c(&a);
c.SetTMP();
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
int n = GetMultiplyDimIndex(a, b);
if(n == -1){
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHeadInt(&c, leadingDim);
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
else if(n >= 0 && n < a.order){
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
......
......@@ -31,7 +31,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (do it on site)
......@@ -39,7 +39,7 @@ keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return a XTensor structure)
......@@ -47,7 +47,7 @@ make a new tensor to keep the result and return it
c(i) = a(i)*b(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaMultiplyDim(a, b, c, n, alpha);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for(int i = a->order - 1; i >= 0; i--){
if(i > n)
stride *= a->dimSize[i];
else if(i < n)
blockNum *= a->dimSize[i];
}
if(a->dataType == DEFAULT_DTYPE){
int num = a->unitNum;
if(stride > 1){
for(int i = 0, j = 0; i < num; i += stride, j++){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize);
DTYPE * cp = (DTYPE*)c->data + i;
for(int k = 0; k < stride; k++)
if(alpha == 0.0F)
cp[k] = ap[k] * bv;
else
cp[k] = ap[k] * bv + alpha * cp[k];
}
}
else if(stride == 1){
DTYPE * bp = (DTYPE*)b->data;
for(int i = 0; i < num; i += blockSize){
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if(alpha == 0.0F){
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j];
}
else{
for(int j = 0; j < blockSize; j++)
cp[j] = ap[j] * bp[j] + alpha * cp[j];
}
}
}
else{
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
{
_MultiplyDim(a, b, a, n, alpha);
}
/*
tensor multiplication (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
<< return - the result tensor by tensor multiplication
*/
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMP();
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "MultiplyDim.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor multiplication of a tensor and a row vector
c = a * b + \alpha * c
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.x] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.x];
}
/*
tensor multiplication of a tensor and a column vector
c = a * b + \alpha * c
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> alpha - the scaling factor
*/
template <class T, bool alphaFired>
__global__
void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T alpha)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (alphaFired)
c[offset] = a[offset] * bv[threadIdx.y] + c[offset] * alpha;
else
c[offset] = a[offset] * bv[threadIdx.y];
}
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c. we save it in a if c is NULL
>> n - the dimension index
>> alpha - the scaling factor
*/
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in multiplication!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in multiplication!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
else
KernelMultiplyWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_CUH__
#define __MULTIPLYDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting (cuda version) */
void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#ifndef __MULTIPLYDIM_H__
#define __MULTIPLYDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha = 0.0);
/* tensor multiplication a = a * b + \alpha * a where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__
......@@ -24,6 +24,7 @@
#include "../../XUtility.h"
#include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -123,7 +124,34 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
{
_Sub(a, b, a, beta);
}
/*
return a dimension if the subtraction is performed as SubDim (see SubDim.h for more details)
>> a - a tensor
>> b - another tensor for subtraction
*/
int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
int hitCount = 0;
int hitDim = -1;
for(int i = 0; i < b.order; i++){
if(b.dimSize[b.order - 1 - i] == 1)
continue;
else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
hitCount++;
hitDim = a.order - b.order + i;
}
}
if(hitCount == 1)
return hitDim;
else
return -1;
}
/*
tensor subtraction c = a - b * \beta (return a XTensor structure)
make a new tensor c to keep the result and return it
......@@ -138,12 +166,28 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor c(&a);
c.SetTMP();
/* call _Sub function */
_Sub(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
int n = GetSubDimIndex(a, b);
if(n == -1){
/* call _Sub function */
_Sub(&a, &b, &c, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
else if(n >= 0 && n < a.order){
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
else{
ShowNTErrors("Something is wrong!");
}
return c;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (XTensor::IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return a XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
return c;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a - b * \beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting */
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
......@@ -131,7 +131,7 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
......@@ -182,7 +182,7 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
XLink::AddParamToHead(&c, beta);
}
else if(n >= 0 && n < a.order){
/* call _Sum function */
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
......
......@@ -49,7 +49,7 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
for(int i = 0; i < a->order; i++){
if(i == dim){
CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
CheckNTErrors(low >= 0 && low < a->dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
}
else{
......@@ -101,7 +101,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
for(int i = 0; i < a.order; i++){
if(i == dim){
CheckNTErrors(low > 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(low >= 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!");
dimSize[i] = high - low;
}
......@@ -118,6 +118,7 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
XLink::AddParamToHeadInt(&c, low);
XLink::AddParamToHeadInt(&c, high);
......
......@@ -70,8 +70,8 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
fanOut = numOutputFmaps * receptiveFieldSize;
}
DTYPE std = gain * sqrt(2.0/(fanIn + fanOut));
DTYPE a = sqrt(3.0) * std;
DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
DTYPE a = (DTYPE)sqrt(3.0) * std;
_SetDataRand(tensor, -a, a);
}
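/*
side note on the sqrt(3.0) factor above (the standard Xavier/Glorot argument, stated here for clarity):
a uniform distribution on [-a, a] has variance a^2 / 3, so choosing
    a = sqrt(3) * std,  std = gain * sqrt(2 / (fanIn + fanOut))
makes _SetDataRand(tensor, -a, a) produce values whose standard deviation is exactly std
*/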
......@@ -213,6 +213,106 @@ void _SetDataFixedDouble(XTensor * tensor, double p)
_SetDataFixed(tensor, &p);
}
/*
set data items along a given dimension (and keep the remaining items unchanged)
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> len - length along the given dimension
>> dim - the dimension along which we set the data
>> p - the value assigned to the selected entries
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
*/
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < n, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len <= tensor->GetDim(dim), "Illegal length!");
if(tensor->devID < 0){
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int l = len * stride;
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + blockSize * i + beg * stride;
for(int j = 0; j < l; j++)
d[j] = p;
}
}
else{
#ifdef USE_CUDA
_CudaSetDataDim(tensor, beg, len, dim, p);
#endif
}
}
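/*
a minimal usage sketch (illustration only): reproduce the documented example above on a CPU tensor,
i.e., set all entries of row 1 of a 3 * 3 tensor to 0
*/
void SetDataDimExample()
{
    XTensor t;
    InitTensor2D(&t, 3, 3);            /* a 3 * 3 tensor on the default (CPU) device - an assumption for this sketch */
    _SetDataFixedFloat(&t, 1.0F);      /* fill every entry with 1 */
    _SetDataDim(&t, 1, 1, 0, 0.0F);    /* beg = 1, len = 1, dim = 0, p = 0 -> row 1 becomes all zeros */
}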
/*
generate data as lower triangular matrices for the last two dimensions
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
if(tensor->devID < 0){
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
for(int i = 0; i < blockNum; i++){
DTYPE * d = (DTYPE*)tensor->data + i * blockSize;
for(int row = 0; row < l; row++){
for(int col = 0; col <= row + shift; col++){
d[row * l + col] = p;
}
for(int col = MAX(0, row + shift + 1); col < l; col++){
d[row * l + col] = 0;
}
}
}
}
else{
#ifdef USE_CUDA
_CudaSetDataLowTri(tensor, p, shift);
#endif
}
}
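/*
a minimal usage sketch (illustration only): build a 4 * 4 lower triangular mask
(e.g., the kind of causal mask used in attention), with 1 on and below the diagonal
*/
void SetDataLowTriExample()
{
    XTensor mask;
    InitTensor2D(&mask, 4, 4);        /* CPU tensor by default - an assumption for this sketch */
    _SetDataLowTri(&mask, 1.0F, 0);   /* p = 1, shift = 0 -> ones on and below the diagonal, zeros above */
}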
/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
......
......@@ -184,6 +184,169 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
}
}
/*
set data items along a given dimension (and keep the remaining items unchanged) - kernel version
>> d - pointer to the data array to be initialized
>> beg - the beginning position
>> len - length of the segment to be set
>> blockSize - size of a data block
>> blockNum - number of data blocks
>> p - the value assigned to the selected entries
*/
__global__
void KernelSetDataDim(DTYPE * d, int beg, int len, int blockSize, int blockNum, DTYPE p)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block id */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockSize || j >= blockNum)
return;
if(i < beg || i >= beg + len)
return;
d[blockSize * j + i] = p;
}
/*
set data items along a given dimension (and keep the remaining items unchanged) - cuda version
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> len - length along the given dimension
>> dim - the dimension along which we set the data
e.g., given a 3 * 3 tensor
1 2 3
4 5 6
7 8 9
when beg = 1, len = 1, dim = 0 and p = 0, we have
1 2 3
0 0 0
7 8 9
i.e., we set all entries of row 1 to 0
*/
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(dim >= 0 && dim < n, "Illegal dimension!");
CheckNTErrors(beg >= 0 && beg < tensor->GetDim(dim), "Illegal beginning position!");
CheckNTErrors(beg + len >= 0 && beg + len <= tensor->GetDim(dim), "Illegal length!");
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for(int i = n - 1; i > dim; i--){
stride *= tensor->GetDim(i);
}
blockSize = stride * tensor->GetDim(dim);
blockNum = tensor->unitNum / blockSize;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataDim<<<blocks, threads >>>((DTYPE*)tensor->data, beg * stride, len * stride, blockSize, blockNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set lower triangular matrices for each block
>> d - pointer to the data array
>> l - row number (or column number) of each block, i.e.,
a block is an l * l matrix
>> blockSize - size of each block (blockSize = l * l)
>> blockNum - number of the blocks
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
__global__
void _KernelSetDataLowTri(DTYPE * d, int l, int blockSize, int blockNum, DTYPE p, int shift)
{
/* offset in each block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block id */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if(i >= blockSize || j >= blockNum)
return;
int row = i / l;
int col = i % l;
DTYPE * d2 = d + blockSize * j + row * l + col;
if(col <= row + shift)
*d2 = p;
else
*d2 = 0;
}
/*
generate data as lower triangular matrices for the last two dimensions (cuda version)
>> tensor - the tensor whose data is to be set
>> p - the value for each entry of the lower triangular matrices
>> shift - the offset from the diagonal
e.g., for a 3 * 3 tensor,
when p = 1 and shift = 0, we have
1 0 0
1 1 0
1 1 1
when p = 2 and shift = -1, we have
0 0 0
2 0 0
2 2 0
*/
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift)
{
int n = tensor->order;
CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(n >= 2, "The tensor must have an order no less than 2!");
CheckNTErrors(tensor->GetDim(n - 1) == tensor->GetDim(n - 2),
"The last two dimensions must be of the same size!");
int l = tensor->GetDim(-1);
int blockNum = 1;
int blockSize = l * l;
for(int i = 0; i < n - 2; i++)
blockNum *= tensor->GetDim(i);
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(tensor->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
_KernelSetDataLowTri<<<blocks, threads >>>((DTYPE*)tensor->data, l, blockSize, blockNum, p, shift);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper]
>> tensor - the tensor whose data array would be initialized
......@@ -213,7 +376,7 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
DTYPE variance = upper - lower;
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
......
......@@ -37,6 +37,12 @@ void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* set data items along a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* generate data as lower triangular matrices for the last two dimensions (cuda version) */
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
......
......@@ -45,11 +45,17 @@ void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* set data items along a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
/* generate data as lower triangular matrices for the last two dimensions */
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -60,8 +60,12 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p)
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++)
bData[i] = (DTYPE)pow(aData[i], p);
for (int i = 0; i < a->unitNum; i++) {
if (p < 0 && aData[i] == 0)
bData[i] = 1e20F;
else
bData[i] = (DTYPE)pow(aData[i], p);
}
}
}
......
......@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = pow(a[i], p);
if (i < size) {
DTYPE v = a[i];
if (p < 0 && v == 0)
b[i] = 1e20;
else
b[i] = pow(a[i], p);
}
}
/*
......@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size)
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
if (i < size) {
float v = __half2float(a[i]);
if (__half2float(p) < 0 && v == 0)
b[i] = __float2half(1e20);
else
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
}
#endif
}
......
......@@ -29,6 +29,71 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use PTX code to perform a warp-level max reduction on float data
*/
__device__ __forceinline__
float shflDownReduceMax(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.f32 p,%1,r0;"
"@p mov.f32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.f32 p, %1, r0; "
"@p mov.f32 %1,r0;"
"mov.f32 %0,%1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX code to perform a warp-level max reduction on int data
*/
__device__ __forceinline__
int shflDownReduceMax(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
".reg .pred p;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"setp.lt.s32 p,%1,r0;"
"@p mov.s32 %1,r0;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"setp.lt.s32 p, %1, r0; "
"@p mov.s32 %1,r0;"
"mov.s32 %0,%1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
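/*
a functionally equivalent sketch using the CUDA 9 shuffle intrinsic __shfl_down_sync
(for illustration only, assuming full-warp participation; the kernels in this file use the
PTX version above). Only lane 0 of each warp ends up holding the full maximum, which is why
callers test (tid & 0x1f) == 0 before storing the result.
*/
__device__ __forceinline__
float shflDownReduceMaxIntrinsic(float input)
{
    /* tree reduction over one warp: halve the shuffle offset each step */
    for (int offset = 16; offset > 0; offset >>= 1) {
        float other = __shfl_down_sync(0xffffffff, input, offset);
        input = fmaxf(input, other);
    }
    return input;
}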
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
......@@ -63,7 +128,7 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
int iOffset = i % stride;
DTYPE value = (i < stride * blockNum && j < strideNum) ?
input[blockSize * k + stride * j + iOffset]: FLOAT_MIN;
input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
/* load data into the shared mem */
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
......@@ -186,30 +251,26 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
int k = i / stride;
int iOffset = i % stride;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset]: FLOAT_MIN;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
/* load data into the shared mem */
data[tid] = MAX(value, value2);
value = MAX(value, value2);
value = shflDownReduceMax(value);
if ((tid & 0x1f) == 0)
data[tid / 32] = value;
__syncthreads();
/* unroll the warp */
if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();}
if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();}
if(goodSize >= 128) {if(tid < 64) {if(data[tid] < data[tid + 64]) data[tid] = data[tid + 64];} __syncthreads();}
if(goodSize >= 64) {if(tid < 32) {if(data[tid] < data[tid + 32]) data[tid] = data[tid + 32];} __syncthreads();}
if(goodSize >= 32) {if(tid < 16) {if(data[tid] < data[tid + 16]) data[tid] = data[tid + 16];} __syncthreads();}
if(goodSize >= 16) {if(tid < 8) {if(data[tid] < data[tid + 8]) data[tid] = data[tid + 8];} __syncthreads();}
if(goodSize >= 8) {if(tid < 4) {if(data[tid] < data[tid + 4]) data[tid] = data[tid + 4];} __syncthreads();}
if(goodSize >= 4) {if(tid < 2) {if(data[tid] < data[tid + 2]) data[tid] = data[tid + 2];} __syncthreads();}
if(goodSize >= 2) {if(tid < 1) {if(data[tid] < data[tid + 1]) data[tid] = data[tid + 1];} __syncthreads();}
/* write result for this block to the output array */
if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
if (tid < 32) {
if (tid < blockDim.y / 32)
value = data[tid];
else
value = FLOAT_MIN;
value = shflDownReduceMax(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
}
/*
......@@ -271,9 +332,9 @@ void KernelReduceMaxFast(__half * input, __half * output,
if (goodSize >= 64) { if (tid < 32) { if (data[tid] < data[tid + 32]) data[tid] = data[tid + 32]; } __syncthreads(); }
if (goodSize >= 32) { if (tid < 16) { if (data[tid] < data[tid + 16]) data[tid] = data[tid + 16]; } __syncthreads(); }
if (goodSize >= 16) { if (tid < 8) { if (data[tid] < data[tid + 8]) data[tid] = data[tid + 8]; } __syncthreads(); }
if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); }
if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); }
if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); }
if (goodSize >= 8) { if (tid < 4) { if (data[tid] < data[tid + 4]) data[tid] = data[tid + 4]; } __syncthreads(); }
if (goodSize >= 4) { if (tid < 2) { if (data[tid] < data[tid + 2]) data[tid] = data[tid + 2]; } __syncthreads(); }
if (goodSize >= 2) { if (tid < 1) { if (data[tid] < data[tid + 1]) data[tid] = data[tid + 1]; } __syncthreads(); }
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
/* write result for this block to the output array */
......@@ -291,7 +352,7 @@ reduce a tensor to another that keeps the max value along a dimension - simple
*/
__global__
void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
int stride, int strideNum, int blockSize, int blockNum)
int stride, int strideNum, int blockSize, int blockNum)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -326,6 +387,108 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
op[offset] = max;
}
/*
allocate the number of warps per block according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8){
warpNum = 8;
if (vectorNum < 20 * 4){
warpNum = 16;
if (vectorNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
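/*
a worked example of the allocation above (illustration only): for vectorNum = 30 and vectorSize = 200,
the branches give warpNum = 32, minWarpNum = ceil(200 / 32) = 7, so warpNum = min(32, 7) = 7;
the launch configuration is then grid = (30, 1, 1) and block = (1, 7 * 32 = 224, 1),
i.e., one thread block per reduced vector, with 7 warps striding over its 200 elements
*/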
/*
adjust threads.x so that the warp-level optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1) {
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
/*
In some cases, we use fewer blocks to improve efficiency
*/
__global__
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
{
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int startIndex = idy * strideNum;
DTYPE threadMax = FLOAT_MIN;
for (int i = idx; i < strideNum; i += 32) {
threadMax = max(input[startIndex + i], threadMax);
}
threadMax = shflDownReduceMax(threadMax);
if (idx == 0)
output[idy] = threadMax;
}
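/*
note on the thread mapping above: each warp of 32 consecutive threads reduces one vector of
length strideNum, reading with a stride of 32 so that global-memory accesses stay coalesced;
with 128 threads per block (see the call in _CudaReduceMax), one block covers 4 vectors,
which is why blockNum is divided by 4 before the launch
*/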
/*
reduce with the PTX-based warp primitives
*/
__global__
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
int reducedStrideNum,int blockSize, int blockNum)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadMax = FLOAT_MIN;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
threadMax = max(inputData[it * stride + iOffset], threadMax);
}
__syncthreads();
threadMax = shflDownReduceMax(threadMax);
if ((tid & 0x1f) == 0)
data[tid / 32] = threadMax;
__syncthreads();
/* use one warp to reduce remaining data */
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = FLOAT_MIN;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
}
}
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
......@@ -336,20 +499,18 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
*/
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors(input && output, "Empty input or output tensors!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
}
}
......@@ -382,130 +543,149 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
do{
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
else {
iData = buf2ft16;
oData = buf1ft16;
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceMax << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<64> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
iter++;
iter++;
}while(strideNum > 1);
} while (strideNum > 1);
}
BacktoCudaDev(input->devID, devIDBackup);
......
......@@ -27,6 +27,57 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use PTX code to perform a warp-level sum reduction on float data
*/
__device__ __forceinline__
float shflDownReduceSum(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.f32 %0, r0, %1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
/*
use PTX code to perform a warp-level sum reduction on int data
*/
__device__ __forceinline__
int shflDownReduceSum(int input)
{
int output;
asm volatile(
"{"
".reg .s32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.s32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.s32 %0, r0, %1;"
"}"
: "=r"(output) : "r"(input));
return output;
}
/*
reduce a tensor to another that keeps the sum along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
......@@ -157,6 +208,7 @@ void KernelReduceSum(__half * input, __half * output,
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half value = isValid ? __hsub(input[blockSize * k + stride * j + iOffset], bias[threadIdx.x]) : __half(0);
DTYPE power2 = __half2float(power);
if(power2 != (DTYPE)1.0){
if(power2 == (DTYPE)2.0)
value = __hmul(value, value);
......@@ -276,25 +328,19 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
value2 = exp(value2);
}
/* load data into the shared mem */
data[tid] = value + value2;
value = value + value2;
__syncthreads();
/* unroll the warp */
if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
/* write result for this block to the output array */
if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
value = shflDownReduceSum(value);
if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
value = data[tid];
else value = 0;
value = shflDownReduceSum(value);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
}
}
/*
......@@ -430,6 +476,195 @@ void KernelReduceSumFast(__half * input, __half * output,
#endif
}
/*
if the data storage is discontinuous, use this kernel to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int blockIndex = idx / stride;
int offsetInBlock = idx % stride;
if (idx >= stride * blockNum)
return;
bias[idx % blockDim.x] = shift != NULL ? shift[idx] : 0;
DTYPE ans = 0;
#pragma unroll
for (int i = stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
i += stride){
DTYPE value = input[i];
value = value - bias[idx % blockDim.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) {
value = exp(value);
}
ans += value;
}
output[idx] = ans;
}
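/*
a CPU reference of the same index mapping (a sketch for illustration, ignoring shift/power/isExp):
each output element (blockIndex, offsetInBlock) sums strideNum inputs spaced stride apart
*/
void ReduceSumDiscontinuousReference(const float * input, float * output,
                                     int stride, int strideNum, int blockNum)
{
    for (int idx = 0; idx < stride * blockNum; idx++) {
        int blockIndex = idx / stride;
        int offsetInBlock = idx % stride;
        float sum = 0;
        for (int i = 0; i < strideNum; i++)
            sum += input[stride * strideNum * blockIndex + offsetInBlock + i * stride];
        output[idx] = sum;
    }
}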
__global__
void KernelReduceSumOp(DTYPE * input, DTYPE * output,
int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= stride * blockNum)
return;
if (threadIdx.y == 0)
bias[threadIdx.x] = shift != NULL ? shift[i] : 0;
__syncthreads();
/* first level reduction */
int k = i / stride;
int iOffset = i % stride;
DTYPE threadSum = 0;
DTYPE * data = iData + threadIdx.x * blockDim.y;
DTYPE * inputData = input + k * blockSize;
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE value = inputData[it * stride + iOffset] - bias[threadIdx.x];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
__syncthreads();
threadSum = shflDownReduceSum(threadSum);
if ((tid & 0x1f) == 0) { data[tid / 32] = threadSum; }
__syncthreads();
if (tid < 32){
if (tid < blockDim.y / 32)
threadSum = data[tid];
else threadSum = 0;
threadSum = shflDownReduceSum(threadSum);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadSum;
}
}
__global__
void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
int strideNum, int blockNum,
DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = threadIdx.x % 32;
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
if (idx == 0)
bias[threadIdx.x / 32] = shift != NULL ? shift[idy] : 0;
int startIndex = idy * strideNum;
DTYPE threadSum = 0;
for (int i = idx; i < strideNum; i += 32) {
DTYPE value = input[startIndex + i] - bias[threadIdx.x / 32];
if (power != (DTYPE)1.0) {
if (power == (DTYPE)2.0) {
value = value * value;
}
else if (power == (DTYPE)0.5) {
value = sqrt(value);
}
else {
value = pow(value, power);
}
}
if (isExp) value = exp(value);
threadSum += value;
}
threadSum = shflDownReduceSum(threadSum);
if (idx == 0)
output[idy] = threadSum;
}
/*
allocate the number of warps per block according to the GPU's SM count
*/
inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
{
int warpNum = 4;
if (vectorNum < 20 * 8) {
warpNum = 8;
if (vectorNum < 20 * 4) {
warpNum = 16;
if (vectorNum < 20 * 2)
warpNum = 32;
}
}
int minWarpNum = vectorSize / 32;
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
block.y = warpNum * 32;
block.z = 1;
}
/*
in this situation we use block.x * grid.x threads to deal with one vector, so that reads are continuous (coalesced)
*/
inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
{
block.x = 512;
block.y = 1;
if ((stride * blockNum) % 512 == 0)
grid.x = (stride * blockNum) / 512;
else
grid.x = (stride * blockNum) / 512 + 1;
grid.y = 1;
}
/*
adjust threads.x so that the warp-level optimization can be used
*/
inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
{
if (threads.x > 1){
blocks.x *= threads.x;
threads.x = 1;
}
if (threads.y < 32)
threads.y = 32;
}
/*
sum the items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
......@@ -444,27 +679,24 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || output->unitNum == shift->unitNum), "Incorrect shift tensor size!");
CheckNTErrors(input && output, "Empty input or output tensors!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input->order > dim && dim >= 0, "Illegal dimension to reduce!");
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]),
"Unmatched tensors!");
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
}
}
if(input->dataType == X_FLOAT16){
CheckNTErrors((power == 0 || power == 0.5 || power == 1.0 || power == 2.0), "TODO!");
}
if(input->dataType == X_FLOAT16)
CheckNTErrors(power == 0 || power == 0.5 || power == 1.0 || power == 2.0, "TODO!");
int cudaGridSize[3];
int cudaBlockSize[3];
......@@ -496,136 +728,172 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
do{
if(input->dataType == DEFAULT_DTYPE){
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if(strideNum < 32){
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 128){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 256){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if(strideNum < 512){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else{
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
if (stride == 1 && blockNum >= 10) {
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
else if(input->dataType == X_FLOAT16){
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if(strideNum < 32){
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 128){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 256){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if(strideNum < 512){
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 <<<dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1])>>> ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
KernelReduceSumDiscontinuousStorage <<<grid, block>>> ((DTYPE *)input->data, (DTYPE*)output->data, stride,
strideNum, blockNum,sp, power, isExp);
}
else {
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
DTYPE * oData = NULL;
if (iter == 0) {
iData = (DTYPE*)input->data;
oData = buf1;
}
else if (iter % 2 == 1) {
iData = buf1;
oData = buf2;
}
else {
iData = buf2;
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, sp, power, isExp);
}
}
else{
GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
else if (input->dataType == X_FLOAT16) {
__half * buf1ft16 = (__half *)buf1;
__half * buf2ft16 = (__half *)buf2;
__half * spft16 = (__half *)sp;
unsigned short power2 = FloatToFloat16(power);
__half * powerft16p = (__half*)&power2;
__half * iData = NULL;
__half * oData = NULL;
if (iter == 0) {
iData = (__half*)input->data;
oData = buf1ft16;
}
else if (iter % 2 == 1) {
iData = buf1ft16;
oData = buf2ft16;
}
else {
iData = buf2ft16;
oData = buf1ft16;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
KernelReduceSum <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 128) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 256) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else if (strideNum < 512) {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
else {
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
oData = (__half*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
KernelReduceSumFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y,
blockSize, blockNum, spft16, *powerft16p, isExp);
}
}
}
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
strideNum = cudaGridSize[0];
blockSize = cudaGridSize[0];
sp = NULL;
power = (DTYPE)1.0;
isExp = false;
iter++;
}while(strideNum > 1);
iter++;
} while (strideNum > 1);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
......
......@@ -176,15 +176,19 @@ make a new tensor to keep the result and return it
*/
XTensor LogSoftmax(const XTensor &x, int leadDim)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
_LogSoftmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
XLink::AddParamToHeadInt(&y, ld);
return y;
}
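/*
a minimal usage sketch (illustration only) of the new negative-dimension convention:
passing leadDim = -1 now normalizes over the last dimension of x
*/
void LogSoftmaxLeadDimExample()
{
    XTensor x;
    InitTensor2D(&x, 2, 4);             /* 2 rows of 4 scores each, on the CPU by default (an assumption) */
    _SetDataRand(&x, -1.0F, 1.0F);
    XTensor y = LogSoftmax(x, -1);      /* -1 is mapped to x.order - 1, i.e., the last dimension */
}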
......@@ -248,7 +252,7 @@ There are two ways to implement this process.
Method 1. we compute dE/dy and dy/dx resepectively, and then reach dE/dx by dE/dx = dE/dy * dy/dx
(or more precisely dE/dx_j = \sum_{i} {dE/dy_i * dy_i/dx_j})
Method 2. we compute dE/dx (or dE/dx_j) in a single step, rather than resorting to the
sub-models dE/dy and dy/dx. We can do this by using dE/dx_j = -gold_j + exp(y_j)
sub-models of dE/dy and dy/dx. We can do this by using dE/dx_j = -gold_j + exp(y_j)
Here we choose Method 2, i.e., we straightforwardly compute dE/dx_j by
......@@ -257,12 +261,12 @@ dE/dx_j = -gold_j + exp(y_j)
(or dE/dx_j = -\delta(i,j) + exp(y_j) for a Maximum A Posteriori Estimation (MAP))
Method 1 is also fine but is more time consuming due to the summation over dimensions.
Note that this method is not good for the standard version softmax when working with
the cross entropy loss. Because it is numerical unstable. When we use a usual method to
Note that this method is not good for the standard version softmax when we work with
the cross entropy loss because it is numerically unstable. When we use the usual method to
define softmax, we have softmax: y_i = e^{x_i} / \sum_{k} e^{x_k}. It is trivial to
know that dy_i/dx_j = y_i * \delta(i,j) - y_i * y_j. As y_i and y_j could be a small number,
y_i * y_i would result in a much smaller one with a risk of lossing precision. This is even
worse we multiply dy_i/dx_j with dE/dy_i. So it is in general to use log softmax instead for
know that dy_i/dx_j = y_i * \delta(i,j) - y_i * y_j. As y_i and y_j could be small numbers,
y_i * y_j would result in a much smaller value, with a risk of losing precision. This is even
worse when we multiply dy_i/dx_j by dE/dy_i. So it is better in general to use log softmax for
better numerical stability.
>> gold - gold standard to measure error (or loss)
......
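For reference, here is a short worked derivation of the single-step rule dE/dx_j = -gold_j + exp(y_j) quoted above. This is only a sketch, written under the assumption of a cross-entropy loss E = -\sum_i gold_i * y_i with y being the log-softmax output:

    y_j = x_j - log \sum_k e^{x_k}
    dy_i/dx_j = \delta(i,j) - e^{x_j} / \sum_k e^{x_k} = \delta(i,j) - exp(y_j)
    dE/dy_i = -gold_i
    dE/dx_j = \sum_i dE/dy_i * dy_i/dx_j
            = \sum_i (-gold_i) * (\delta(i,j) - exp(y_j))
            = -gold_j + exp(y_j) * \sum_i gold_i
            = -gold_j + exp(y_j)        (since \sum_i gold_i = 1 for a one-hot or normalized gold)

That is, the single-step formula follows directly from the chain rule, with no explicit construction of the Jacobian dy/dx.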
......@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
for(int i = 0; i < n; i++){
DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
if(IsNAN(r))
r = DTYPE_MIN;
if(IsINF(r))
r = DTYPE_MIN;
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
op[i * m + j] = r;
}
}
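A minimal sketch of the clamp introduced above — an illustration only, and the helper name is hypothetical, not part of the library. Mathematically exp(x - max) / sum lies in [0, 1], so the clamp only guards against floating-point rounding, replacing the earlier IsNAN/IsINF fallbacks.

/* hypothetical helper illustrating the clamp applied to each softmax output */
inline DTYPE ClampSoftmaxOutput(DTYPE r)
{
    if (r > (DTYPE)1.0F)      /* rounding pushed the value slightly above 1 */
        return (DTYPE)1.0F;
    if (r < (DTYPE)0.0F)      /* a negative value can only come from rounding or overflow */
        return (DTYPE)0.0F;
    return r;
}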
......@@ -143,14 +143,19 @@ make a new tensor to keep the result and return it
*/
XTensor Softmax(const XTensor &x, int leadDim)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
XTensor y(&x);
y.SetTMP();
/* call _Softmax function */
_Softmax(&x, &y, leadDim);
_Softmax(&x, &y, ld);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
return y;
}
......
......@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
if(i < strideSizeTotal && j < strideNum){
int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
y[offset] = r;
}
}
......@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
maxData = broadcast(maxData);
if (i < strideNum){
int offset = int(j / stride) * blockSize + i * stride + i2;
output[offset] = exp(input[offset] - maxData) / sumData;
DTYPE r = exp(input[offset] - maxData) / sumData;
if (r > (DTYPE)1.0F)
r = (DTYPE)1.0F;
else if (r < 0)
r = 0;
output[offset] = r;
}
}
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#include "TDivDim.h"
#include "../core/arithmetic/DivDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided by b via broadcasting.
In this case, (2, 4) / (2) = (2, 4), n = 0, alpha = 0.0.
*/
bool TestDivDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 0);
_DivDim(cMe, b, 0);
cUser = DivDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call DivDim function */
_DivDim(aGPU, bGPU, cGPU, 0);
_DivDim(cMeGPU, bGPU, 0);
cUserGPU = DivDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor division c = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided by b via broadcasting.
In this case, (2, 4) / (2, 2) = (2, 4), n = 1.
*/
bool TestDivDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {0.0F, -1.0F, -2.0F, 3.0F},
{4.0F, -5.0F, -6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call DivDim function */
_DivDim(a, b, c, 1);
_DivDim(cMe, b, 1);
cUser = DivDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call DivDim function */
_DivDim(aGPU, bGPU, cGPU, 1);
_DivDim(cMeGPU, bGPU, 1);
cUserGPU = DivDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for DivDim Function */
bool TestDivDim()
{
XPRINT(0, stdout, "[TEST DIVDIM] tensor division c(i) = a/b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDivDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDivDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-14
*/
#ifndef __TEST_DIVDIM_H__
#define __TEST_DIVDIM_H__
#include "../core/arithmetic/DivDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for DivDim Function */
bool TestDivDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#include "TMultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor multiplication c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
In this case, (2, 4) * (2) = (2, 4), n = 0.
*/
bool TestMultiplyDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{-4.0F, -5.0F, -6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 0);
_MultiplyDimMe(cMe, b, 0);
cUser = MultiplyDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 0);
_MultiplyDimMe(cMeGPU, bGPU, 0);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor multiplication c = a*b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting.
In this case, (2, 4) * (4) = (2, 4), n = 1.
*/
bool TestMultiplyDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (4) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[4] = {1.0F, -1.0F , 1.0F, -1.0F};
DTYPE answer[2][4] = { {0.0F, -1.0F, 2.0F, -3.0F},
{4.0F, -5.0F, 6.0F, -7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(a, b, c, 1);
_MultiplyDimMe(cMe, b, 1);
cUser = MultiplyDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call MultiplyDim function */
_MultiplyDim(aGPU, bGPU, cGPU, 1);
_MultiplyDimMe(cMeGPU, bGPU, 1);
cUserGPU = MultiplyDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* test for MultiplyDim Function */
bool TestMultiplyDim()
{
XPRINT(0, stdout, "[TEST MULTIPLYDIM] tensor multiplication c = a * b + \alpha * c by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestMultiplyDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestMultiplyDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
*/
#ifndef __TEST_MULTIPLYDIM_H__
#define __TEST_MULTIPLYDIM_H__
#include "../core/arithmetic/MultiplyDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MultiplyDim Function */
bool TestMultiplyDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MULTIPLYDIM_H__
\ No newline at end of file
......@@ -24,7 +24,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: sum the items along a dimension of the tensor.
case 1: test ReduceSum function.
Sum the items along a dimension of the tensor.
In this case,
(2, 4) -> (4), dim = 0
(2, 4) -> (2), dim = 1
......@@ -90,8 +91,8 @@ bool TestReduceSum1()
tUser2 = ReduceSum(*s, 1, *shift2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -120,8 +121,8 @@ bool TestReduceSum1()
tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1) &&
tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "TSubDim.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a via broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a via broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * c = NewTensor(aOrder, aDimSize);
XTensor * cMe = NewTensor(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
......@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2) = (2, 4), n = 0.
*/
bool TestSumDim1()
{
......@@ -79,9 +80,9 @@ bool TestSumDim1()
cUser = SumDim(*a, *b, 0);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum)
&& cMe->CheckData(answer, aUnitNum)
&& cUser.CheckData(answer, aUnitNum);
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -106,9 +107,9 @@ bool TestSumDim1()
cUserGPU = SumDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum)
&& cMeGPU->CheckData(answer, aUnitNum)
&& cUserGPU.CheckData(answer, aUnitNum);
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
......@@ -139,7 +140,8 @@ bool TestSumDim1()
/*
case 2: tensor summation c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
i.e., a is summed with b by broadcasting.
In this case, (2, 4) + (2, 2) = (2, 4), n = 1.
*/
bool TestSumDim2()
{
......@@ -192,9 +194,9 @@ bool TestSumDim2()
cUser = SumDim(*a, *b, 1);
/* check results */
cpuTest = c->CheckData(answer, aUnitNum)
&& cMe->CheckData(answer, aUnitNum)
&& cUser.CheckData(answer, aUnitNum);
cpuTest = c->CheckData(answer, aUnitNum) &&
cMe->CheckData(answer, aUnitNum) &&
cUser.CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -219,9 +221,9 @@ bool TestSumDim2()
cUserGPU = SumDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = cGPU->CheckData(answer, aUnitNum)
&& cMeGPU->CheckData(answer, aUnitNum)
&& cUserGPU.CheckData(answer, aUnitNum);
gpuTest = cGPU->CheckData(answer, aUnitNum) &&
cMeGPU->CheckData(answer, aUnitNum) &&
cUserGPU.CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
......
......@@ -99,8 +99,8 @@ bool TestUnsqueeze1()
tUser2 = Unsqueeze(*s, 2, 2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1) &&
t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......
......@@ -38,6 +38,7 @@ bool Test()
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
wrong = !TestDivDim() || wrong;
wrong = !TestExp() || wrong;
wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong;
......@@ -46,6 +47,7 @@ bool Test()
wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMerge() || wrong;
wrong = !TestMultiply() || wrong;
wrong = !TestMultiplyDim() || wrong;
wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong;
......
......@@ -31,6 +31,7 @@
#include "TCopyIndexed.h"
#include "TCopyValues.h"
#include "TDiv.h"
#include "TDivDim.h"
#include "TExp.h"
#include "TLog.h"
#include "TMatrixMul.h"
......@@ -39,6 +40,7 @@
#include "TMatrixMulBatched.h"
#include "TMerge.h"
#include "TMultiply.h"
#include "TMultiplyDim.h"
#include "TNegate.h"
#include "TNormalize.h"
#include "TPower.h"
......