update code and fix reduceSum bug

06e95a0a · 张裕浩 · fa2ed07c · 06e95a0a · 06e95a0a · 06e95a0a
Commit 06e95a0a authored Aug 06, 2018 by 张裕浩
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -25,23 +25,55 @@
 #include "../tensor/core/CHeader.h"
 #include "../sample/fnnlm/FNNLM.h"
 #include "../tensor/test/Test.h"
+#include <cuda_runtime.h>
+#include <time.h>
+#include <windows.h>

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
 //#include <crtdbg.h>

 using namespace nts;
-using namespace samplefnnlm;
+using namespace fnnlm;

+void SetDataTest()
+{
+    int * dimSize = new int[2];
+    dimSize[0] = 10000;
+    dimSize[1] = 1000;
+    XTensor b1(2, dimSize, X_FLOAT, 1.0F, 0, NULL);
+    XTensor b2(2, dimSize, X_FLOAT, 1.0F, 0, NULL);
+    XTensor b3(2, dimSize, X_FLOAT, 1.0F, -1, NULL);
+
+    DWORD  m_start_time;
+    DWORD  m_end_time;
+    double time_diff = 0.0;
+
+    m_start_time = GetTickCount();
+    _SetDataRand(&b1, -2.0F, 2.0F);
+    cudaThreadSynchronize();
+    m_end_time = GetTickCount();
+    time_diff = m_end_time - m_start_time;
+    printf("time %f ms\n", time_diff);
+
+    m_start_time = GetTickCount();
+    _SetDataRand(&b3, -2.0F,2.0F);
+    cudaThreadSynchronize();
+    m_end_time = GetTickCount();
+    time_diff = m_end_time - m_start_time;
+    printf("time %f ms\n", time_diff);
+
+}

 int main( int argc, const char ** argv )
 {
    
-    //if(argc > 1 && !strcmp(argv[1], "-test"))
+    if(argc > 1 && !strcmp(argv[1], "-test"))
        Test();
-    /*else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    //SetDataTest();
+    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
-    else{
+    /*else{
        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");

--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -50,6 +50,7 @@ void XFuncGrad::MakeGrad(XTensor * node)
        _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
+        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
@@ -58,11 +59,14 @@ void XFuncGrad::MakeGrad(XTensor * node)
        _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
+        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
    }
    else{
        ShowNTErrors("Wrong activation function type!");
    }
+
+    node->visitMark = NODE_FINISHED;
 }

 /* indicates whether the node is for an activation function */

--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -37,10 +37,52 @@ void XMathGrad::MakeGrad(XTensor * node)

    if(operID == MATH_SUM)
        GradSum(node);
+    else if(operID == MATH_SUMDIM)
+        GradSumDim(node);
    else if(operID == MATH_MULTIPLY)
        GradMultiply(node);
    else if(operID == MATH_MATRIXMUL)
        GradMatrixMul(node);
+    else if(operID == MATH_MATRIXMULBATCHED)
+        GradMatrixMulBatched(node);
+    else if (operID == MATH_LOG)
+        GradLog(node);
+    else if (operID == MATH_POWER)
+        GradPower(node);
+    else if (operID == MATH_NEGATE)
+        GradNegate(node);
+    else if (operID == MATH_SCALEANDSHIFT)
+        GradScaleAndShift(node);
+    else if (operID == MATH_DIV)
+        GradDiv(node);
+    else if (operID == MATH_SUB)
+        GradSub(node);
+    else if (operID == MATH_SIN)
+        GradSin(node);
+    else if (operID == MATH_COS)
+        GradCos(node);
+    else if (operID == MATH_TAN)
+        GradTan(node);
+    else if (operID == MATH_EXP)
+        GradExp(node);
+    else if (operID == MATH_NORMALIZE)
+        GradNormalize(node);
+    else if (operID == MATH_ABSOLUTE)
+        GradAbsolute(node);
+    else if (operID == MATH_SIGN)
+        GradSign(node);
+    else if (operID == MATH_ROUND)
+        GradRound(node);
+    else if (operID == MATH_CLIP)
+        GradClip(node);
+    else if (operID == REDUCE_REDUCEMEAN)
+        GradReduceMean(node);
+    else if (operID == REDUCE_REDUCESUM)
+        GradReduceSum(node);
+    else if (operID == REDUCE_REDUCESUMSQUARED)
+        GradReduceSumSquared(node);
+    else if (operID == REDUCE_REDUCEVARIANCE)
+        GradReduceVariance(node);
    else{
        ShowNTErrors("TODO!");
    }
@@ -70,11 +112,108 @@ void XMathGrad::GradSum(XTensor * node)
    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
    DTYPE beta = income.GetParam(0);
+
    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

    _Sum(a->grad, node->grad, a->grad);
    _Sum(b->grad, node->grad, b->grad, beta);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+gradient for sum with one dimension
+c = a + b * \beta
+where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
+dE/da = dE/dc
+dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
+*/
+void XMathGrad::GradSumDim(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMDIM!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    int n = income.GetParamInt(0);
+    DTYPE beta = income.GetParam(1);
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    _Sum(a->grad, node->grad, a->grad);
+
+    int order = a->order;
+    int dimSize[MAX_TENSOR_DIM_NUM];
+    memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
+
+    if(n == order - 1){
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = a->unitNum/dimSize[order - 1];
+        reshapedSize[1] = dimSize[order - 1];
+
+        /* we reshape dE/dc to a matrix whose column number is equal to the 
+           size of b. Then we can reduce the matrix into a row vector. */
+        node->grad->Reshape(2, reshapedSize);
+
+        if(b->outgo.tailNum > 1){
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(node->grad, bGradTMP, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(bGradTMP, beta);
+            _Sum(bGradTMP, b->grad, b->grad);
+            DelTensorBuf(bGradTMP);
+        }
+        else{
+            _ReduceSum(node->grad, b->grad, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(b->grad, beta);
+        }
+
+        node->grad->Reshape(order, dimSize);
+    }
+    else{
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = 1;
+        reshapedSize[1] = dimSize[n];
+        reshapedSize[2] = 1;
+
+        for(int i = 0; i < order; i++){
+            if(i < n)
+                reshapedSize[0] *= dimSize[i];
+        }
+
+        reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
+
+        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|. 
+           Then reduce along with z and x to obtain dE/db. */
+        node->grad->Reshape(3, reshapedSize);
+
+        XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+
+        _ReduceSum(node->grad, interGrad, 2);
+
+        if(b->outgo.tailNum > 1){
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(interGrad, bGradTMP, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(bGradTMP, beta);
+            _Sum(bGradTMP, b->grad, b->grad);
+            DelTensorBuf(bGradTMP);
+        }
+        else{
+            _ReduceSum(interGrad, b->grad, 0);
+            if(beta != 1.0F)
+                _ScaleAndShiftMe(b->grad, beta);
+        }
+
+        node->grad->Reshape(order, dimSize);
+
+        DelTensorBuf(interGrad);
+
+    }
+    
+    node->visitMark = NODE_FINISHED;
 }

 /* 
@@ -99,6 +238,8 @@ void XMathGrad::GradMultiply(XTensor * node)
    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
    _Multiply(node->grad, b, a->grad, 1.0F);
    _Multiply(node->grad, a, b->grad, 1.0F);
+
+    node->visitMark = NODE_FINISHED;
 }

 /* 
@@ -124,16 +265,59 @@ void XMathGrad::GradMatrixMul(XTensor * node)
    XNoder::MakeGrad(a);
    XNoder::MakeGrad(b);

+    XTensor * c = node;
    XTensor * dedc = node->grad;
    XTensor * deda = a->grad;
    XTensor * dedb = b->grad;
    
+    if(deda->order == 2 && dedb->order == 2)
+        GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
+    else if(transA == X_NOTRANS && deda->order > 2 && dedb->order == 2){
+        int orderBackupA = a->order;
+        int orderBackupC = c->order;
+        int dimsBackupA[MAX_TENSOR_DIM_NUM];
+        int dimsBackupC[MAX_TENSOR_DIM_NUM];
+        memcpy(dimsBackupA, a->dimSize, sizeof(int) * a->order);
+        memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
+
+        a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
+        c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
+        deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
+        dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
+
+        GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
+
+        a->Reshape(orderBackupA, dimsBackupA);
+        c->Reshape(orderBackupC, dimsBackupC);
+        deda->Reshape(orderBackupA, dimsBackupA);
+        dedc->Reshape(orderBackupC, dimsBackupC);
+    }
+    else{
+        ShowNTErrors("TODO!");
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+    
+/*
+gradient for matrix multiply: c = matmul(a, b) * \alpha
+>> a - as it is
+>> deda - dE/da
+>> b - as it is
+>> dedb - dE/db
+>> dedc - dE/dc
+>> alpha - the scalar
+*/
+void XMathGrad::GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
+                              XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
+                              XTensor * dedc, DTYPE alpha)
+{
    /* c = a * b * \alpha */
    if(transA == X_NOTRANS && transB == X_NOTRANS){
-
+        
        /* dE/da = dE/dc * b^T * \alpha */
        _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
-
+        
        /* dE/db = a^T * dE/dc * \alpha */
        _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
    }
@@ -141,8 +325,9 @@ void XMathGrad::GradMatrixMul(XTensor * node)
    /* c = a^T * b * \alpha */
    else if(transA == X_TRANS && transB == X_NOTRANS){
        
-        /* dE/da = dE/dc * b^T * \alpha */
-        _MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
+        /* dE/da = (dE/dc * b^T)^T * \alpha 
+                 = b * dE/dc^T * \alpha */
+        _MatrixMul(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        
        /* dE/db = a * dE/dc * \alpha */
        _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
@@ -154,19 +339,689 @@ void XMathGrad::GradMatrixMul(XTensor * node)
        /* dE/da = dE/dc * b * \alpha */
        _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
        
-        /* dE/db = a^T * dE/dc * \alpha */
-        _MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+        /* dE/db = (a^T * dE/dc)^T * \alpha 
+                 = dE/dc^T * a * \alpha */
+        _MatrixMul(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
    }
    
    /* c = a^T * b^T * \alpha */
    else if(transA == X_TRANS && transB == X_TRANS){
        
-        /* dE/da = dE/dc * b * \alpha */
-        _MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
+        /* dE/da = (dE/dc * b)^T * \alpha 
+                 = b^T * dE/dc^T * \alpha */
+        _MatrixMul(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = (a * dE/dc)^T * \alpha 
+                 = dE/dc^T * a^T * \alpha */
+        _MatrixMul(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
+    }
+}
+
+/* 
+gradient for matrix multiply in batch mode.
+for each batch: c_i = matmul(a_i, b_i) * \alpha
+for c_i = matmul(a_i, b_i) * \alpha
+we have 
+dE/da_i = dE/dc_i * b_i^T * \alpha
+dE/db_i = a_i^T * dE/dc_i * \alpha
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradMatrixMulBatched(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
+    CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MULTIPLY!");
+
+    XTensor * a = income.tails[0]; 
+    XTensor * b = income.tails[1];
+    MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
+    MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
+    DTYPE alpha = income.GetParam(2);
+
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    XTensor * dedc = node->grad;
+    XTensor * deda = a->grad;
+    XTensor * dedb = b->grad;
+
+    /* c = a * b * \alpha */
+    if(transA == X_NOTRANS && transB == X_NOTRANS){
+        
+        /* dE/da = dE/dc * b^T * \alpha */
+        _MatrixMulBatched(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = a^T * dE/dc * \alpha */
+        _MatrixMulBatched(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a^T * b * \alpha */
+    else if(transA == X_TRANS && transB == X_NOTRANS){
+        
+        /* dE/da = (dE/dc * b^T)^T * \alpha 
+                 = b * dE/dc^T * \alpha */
+        _MatrixMulBatched(b, X_NOTRANS, dedc, X_TRANS, deda, alpha, 1.0F);
        
        /* dE/db = a * dE/dc * \alpha */
-        _MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+        _MatrixMulBatched(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a * b^T * \alpha */
+    else if(transA == X_NOTRANS && transB == X_TRANS){
+        
+        /* dE/da = dE/dc * b * \alpha */
+        _MatrixMulBatched(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = (a^T * dE/dc)^T * \alpha 
+                 = dE/dc^T * a * \alpha */
+        _MatrixMulBatched(dedc, X_TRANS, a, X_NOTRANS, dedb, alpha, 1.0F);
+    }
+    
+    /* c = a^T * b^T * \alpha */
+    else if(transA == X_TRANS && transB == X_TRANS){
+        
+        /* dE/da = (dE/dc * b)^T * \alpha 
+                 = b^T * dE/dc^T * \alpha */
+        _MatrixMulBatched(b, X_TRANS, dedc, X_TRANS, deda, alpha, 1.0F);
+        
+        /* dE/db = (a * dE/dc)^T * \alpha 
+                 = dE/dc^T * a^T * \alpha */
+        _MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for log
+for
+c = log(a)
+we have
+dE/da = dE/dc * 1/a
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradLog(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for LOG!");
+
+    XTensor * a = income.tails[0];
+
+    XNoder::MakeGrad(a);
+
+    _Div(node->grad, a, a->grad, 1.0F);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for power
+for
+c = pow(a,p)
+we have
+dE/da = (dE/dc) * p * a^(p-1)
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradPower(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    DTYPE p = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+
+    _Power(a, b, p - 1.0F);
+    _ScaleAndShiftMe(b, p);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensor(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for negate
+for
+c = -a
+we have
+dE/da = dE/dc * (-1)
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradNegate(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _ScaleAndShift(node->grad, b, -1.0F);
+    _Sum(a->grad, b, a->grad);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for ScaleAndShift
+for
+c = a * scale + shift
+we have
+dE/da = dE/dc * scale
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradScaleAndShift(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
+
+    XTensor * a = income.tails[0];
+
+    DTYPE scale = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+
+    _Sum(a->grad, node->grad, a->grad, scale);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for minus
+for
+c =  a - b * \beta
+we have
+dE/da = dE/dc
+dE/db = -dE/dc * \beta
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradSub(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBSTRACT!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    DTYPE beta = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    _Sum(a->grad, node->grad, a->grad);
+    _Sum(b->grad, node->grad, b->grad, -beta);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for divide
+for
+c =  a / b
+we have
+dE/da = dE/dc / b
+dE/db = dE/dc * a / -b^2
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradDiv(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for DIVIDE!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
+
+    _Div(node->grad, b, a->grad, 1.0F);
+
+    _Power(b, ab2, -2.0F);
+    _Multiply(a, ab2, ab2);
+    _ScaleAndShiftMe(ab2, -1.0F);
+    _Multiply(node->grad, ab2, b->grad, 1.0F);
+
+    DelTensorBuf(ab2);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for exp
+for
+c = exp(a)
+we have
+dE/da = dE/dc * exp(a)
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradExp(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _Exp(a, b);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for sin
+for
+c = sin(a)
+we have
+dE/da = dE/dc * cos(a)
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradSin(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _Cos(a, b);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for cos
+for
+c = cos(a)
+we have
+dE/da = dE/dc * -sin(a)
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradCos(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _Sin(a, b);
+    _ScaleAndShiftMe(b, -1.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for tan
+for
+c = tan(a)
+we have
+dE/da = dE/dc * 1/(cos(a))^2
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradTan(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _Cos(a, b);
+    _PowerMe(b, -2.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for normalize
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradNormalize(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
+
+    XTensor * input = income.tails[0];
+    XTensor * mean = income.tails[1];
+    XTensor * var = income.tails[2];
+    XTensor * a = income.tails[3];
+    XTensor * b = income.tails[4];
+    XTensor * c = NewTensor(var);
+    XTensor * d = NewTensor(a);
+    XTensor * e = NewTensor(a);
+    XTensor * f = NewTensor(a);
+    XTensor * g = NewTensor(a);
+    XTensor * h = NewTensor(a);
+    XTensor * i = NewTensor(a);
+    XTensor * j = NewTensor(a);
+    XTensor * k = NewTensor(var);
+    XTensor * p = NewTensor(var);
+    XTensor * q = NewTensor(var);
+    XTensor * r = NewTensor(a);
+    XTensor * x = NewTensor(mean);
+    XTensor * y = NewTensor(mean);
+    XTensor * z = NewTensor(mean);
+    DTYPE epsilon = income.GetParam(1);
+
+    int dim = income.GetParamInt(0);
+    int n = a->GetDim(dim);
+    XNoder::MakeGrad(input);
+    XNoder::MakeGrad(mean);
+    XNoder::MakeGrad(var);
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    /* dEdinput */
+    _ScaleAndShift(var, c, 1.0F, epsilon);
+    _Unsqueeze(c, d, dim, n);
+    _Power(d, e, -0.5F);
+    _Multiply(a, e, f);
+    _Multiply(node->grad, f, input->grad, 1.0F);
+
+    /* dEdmean */
+    _ScaleAndShift(f, g, -1.0F);
+    _ReduceSum(g, x, dim);
+    _ReduceSum(node->grad, y, dim);
+    _Multiply(y, x, mean->grad, 1.0F);
+
+    /* dEdvar */
+    _Unsqueeze(mean, h, dim, n);
+    _Sub(input, h, i);
+    _Multiply(a, i, j);
+    _Power(var, k, -1.5F);
+    _ScaleAndShift(k, p, -0.5F);
+    _ReduceSum(j, z, dim);
+    _Multiply(z, p, q);
+    _Multiply(y, q, var->grad, 1.0F);
+
+    /* dEda */
+    _Multiply(i, e, r);
+    _Multiply(node->grad, r, a->grad, 1.0F);
+
+    /* dEdb */
+    _Sum(b->grad, node->grad, b->grad);
+
+    node->visitMark = NODE_FINISHED;
+
+    delete c;
+    delete d;
+    delete e;
+    delete f;
+    delete g;
+    delete h;
+    delete i;
+    delete j;
+    delete k;
+    delete p;
+    delete q;
+    delete r;
+    delete x;
+    delete y;
+    delete z;
+}
+
+/*
+gradient for absolute
+for
+c = |a|
+we have
+dE/da = dE/dc   a >= 0
+        -dE/dc  a < 0
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradAbsolute(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    XNoder::MakeGrad(a);
+
+    _Sign(a, b);
+    _Multiply(node->grad, b, a->grad, 1.0F);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for sign
+for
+c = sign(a)
+we have
+dE/da = 0
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradSign(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
+
+    // we do nothing here
+    // TODO: set grad = 0 if the node is the only child
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for round
+for
+c = round(a)
+we have
+dE/da = 0
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradRound(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
+
+    // we do nothing here
+    // TODO: set grad = 0 if the node is the only child
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for clip
+we have
+dE/da = 1  lower < a < upper
+dE/da = 0  otherwise 
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradClip(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    DTYPE lower = income.GetParam(0);
+    DTYPE upper = income.GetParam(1);
+
+    XNoder::MakeGrad(a);
+
+    _ClipBackward(node, a, node->grad, a->grad, lower, upper);
+    _Sum(a->grad, b, a->grad);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for reduceMean
+for
+c = reduceMean(a, dim)
+we have
+dE/da = Unsqueeze(dE/dc) * 1/dimSizeA[dim]
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradReduceMean(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    int dim = income.GetParamInt(0);
+    int n = a->GetDim(dim);
+
+    XNoder::MakeGrad(a);
+
+    _Unsqueeze(node->grad, b, dim, n);
+    _ScaleAndShiftMe(b, 1.0F/n);
+    _Sum(a->grad, b, a->grad);
+
+    DelTensorBuf(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for reduceSum
+for
+c = reduceSum(a, dim)
+we have
+dE/da = Unsqueeze(dE/dc) * 1
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradReduceSum(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
+
+    int dim = income.GetParamInt(0);
+    int n = a->GetDim(dim);
+
+    XNoder::MakeGrad(a);
+
+    _Unsqueeze(node->grad, b, dim, n);
+    _Sum(a->grad, b, a->grad);
+
+    DelTensor(b);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for reduceSumSquared
+for
+c = \sum_i (a_i - b)^2 
+we have
+dE/da = Unsqueeze(dE/dc) * 2a
+dE/db = dE/dc * -2 * n * b
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradReduceSumSquared(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
+
+    int dim = income.GetParamInt(0);
+    int n = a->GetDim(dim);
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    /* dE/da = Unsqueeze(dE/dc) * 2a */
+    _ScaleAndShift(a, c, 2.0F);
+    _Unsqueeze(node->grad, e, dim, n);
+    _Multiply(e, c, a->grad, 1.0F);
+
+    /* dE/db = dE/dc * -2 * n * b */
+    _ScaleAndShift(b, d, -2.0F * n);
+    _Multiply(node->grad, d, b->grad, 1.0F);
+
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for reduceVariance
+for
+c = (sum_i (a_i - b)^2) * 1/n
+where b is the mean, and n is the size of a
+we have
+dE/da = Unsqueeze(dE/dc) * 2a/n
+dE/db = dE/dc * -2 * b
+>> node - the node (c) for backward computation
+*/
+void XMathGrad::GradReduceVariance(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for Reduce!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
+
+    int dim = income.GetParamInt(0);
+    int n = a->GetDim(dim);
+    XNoder::MakeGrad(a);
+    XNoder::MakeGrad(b);
+
+    /* dE/da = Unsqueeze(dE/dc) * 2a/n */
+    _ScaleAndShift(a, c, 2.0F / n);
+    _Unsqueeze(node->grad, e, dim, n);
+    _Multiply(e, c, a->grad, 1.0F);
+
+    /* dE/db = dE/dc * -2 * b */
+    _ScaleAndShift(b, d, -2.0F);
+    _Multiply(node->grad, d, b->grad, 1.0F);
+
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
+
+    node->visitMark = NODE_FINISHED;
 }

 }
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -44,13 +44,105 @@ private:
    static
    void GradSum(XTensor * node);

-    /* gradient for multiply (dot production): c =  a * b */
+    /* gradient for sum with one dimension: c = a + b * \beta
+       where the size of b is equal to that of one dimension of a */
+    static
+    void GradSumDim(XTensor * node);
+
+    /* gradient for multiply (dot production): c =  a * b * \alpha */
    static
    void GradMultiply(XTensor * node);

-    /* gradient for matrix multiply: c = matmul(a, b) */
+    /* gradient for matrix multiply: c = matmul(a, b) * \alpha */
    static
    void GradMatrixMul(XTensor * node);
+    
+    /* gradient for matrix multiply: c = matmul(a, b) * \alpha */
+    static
+    void GradMatrixMul(XTensor * a, XTensor * deda, MATRIX_TRANS_TYPE transA,
+                       XTensor * b, XTensor * dedb, MATRIX_TRANS_TYPE transB,
+                       XTensor * dedc, DTYPE alpha);
+
+    /* gradient for matrix multiply in batch mode.
+       for each batch: c_i = matmul(a_i, b_i) * \alpha */
+    static
+    void GradMatrixMulBatched(XTensor * node);
+
+    /* gradient for log: c =  log(a) */
+    static
+    void GradLog(XTensor * node);
+
+    /* gradient for power */
+    static
+    void GradPower(XTensor * node);
+
+    /* gradient for negate */
+    static
+    void GradNegate(XTensor * node);
+
+    /* gradient for ScaleAndShift */
+    static
+    void GradScaleAndShift(XTensor * node);
+
+    /* gradient for Minus */
+    static
+    void GradSub(XTensor * node);
+
+    /* gradient for Divide */
+    static
+    void GradDiv(XTensor * node);
+
+    /* gradient for reduceMean */
+    static
+    void GradReduceMean(XTensor * node);
+
+    /* gradient for reduceSum */
+    static
+    void GradReduceSum(XTensor * node);
+
+    /* gradient for reduceSumSquared */
+    static
+    void GradReduceSumSquared(XTensor * node);
+
+    /* gradient for reduceVariance */
+    static
+    void GradReduceVariance(XTensor * node);
+
+    /* gradient for sin */
+    static
+    void GradSin(XTensor * node);
+
+    /* gradient for cos */
+    static
+    void GradCos(XTensor * node);
+
+    /* gradient for tan */
+    static
+    void GradTan(XTensor * node);
+
+    /* gradient for exp */
+    static
+    void GradExp(XTensor * node);
+
+    /* gradient for normalize */
+    static
+    void GradNormalize(XTensor * node);
+
+    /* gradient for absolute */
+    static
+    void GradAbsolute(XTensor * node);
+
+    /* gradient for sign */
+    static
+    void GradSign(XTensor * node);
+
+    /* gradient for clip */
+    static
+    void GradClip(XTensor * node);
+
+    /* gradient for round */
+    static
+    void GradRound(XTensor * node);
 };

 }

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -43,6 +43,12 @@ void XShapeGrad::MakeGrad(XTensor * node)
        GradMergeList(node);
    else if(operID == SHAPE_UNSQUEEZE)
        GradUnsqueeze(node);
+    else if(operID == SHAPE_SPLIT)
+        GradSplit(node);
+    else if(operID == SHAPE_SPLIT_LIST)
+        GradSplitList(node);
+    else if (operID == SHAPE_TRANSPOSE)
+        GradTranspose(node);
    else{
        ShowNTErrors("TODO!");
    }
@@ -55,6 +61,13 @@ bool XShapeGrad::IsShapeOP(XTensor * node)
    return (income.typeID & DATA_BASE) != 0;
 }

+/* post processing of a node */
+void XShapeGrad::PostProcessing(XTensor * node, int typeID)
+{
+    if(typeID == SHAPE_SPLIT_LIST)
+        GradSplitListPost(node);
+}
+
 /* 
 gradient for merge
 for 
@@ -134,6 +147,8 @@ void XShapeGrad::GradMerge(XTensor * node)
    gradInputSmall.data = NULL;

    delete[] dims;
+
+    node->visitMark = NODE_FINISHED;
 }

 /* 
@@ -213,6 +228,120 @@ void XShapeGrad::GradMergeList(XTensor * node)
        gradSmall.data = NULL;
        delete[] dims;
    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+gradient computation for split: 
+for
+c = split(a)
+we have
+dE/da = merge(dE/dc)
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradSplit(XTensor * node)
+{
+    XLink &income = node->income;
+    XTensor * input = income.tails[0];
+
+    int whereToSplit = income.GetParamInt(0);
+    int splitNum = income.GetParamInt(1);
+
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SPLIT!");
+    CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!");
+    CheckNTErrors(splitNum == node->dimSize[0], "Wrong split number!");
+
+    XNoder::MakeGrad(input);
+
+    /* we can simply merge the gradient tensor 
+       if the input is used in spliting only */
+    if(input->outgo.tailNum == 1)
+        _Merge(node->grad, input->grad, whereToSplit + 1, 0);
+
+    /* if the tensor is used somewhere else, we need another SUM
+       for gradient accumulation */
+    else{
+        XTensor inputGradTMP(input);
+
+        _Merge(node->grad, &inputGradTMP, whereToSplit + 1, 0);
+        _Sum(input->grad, &inputGradTMP, input->grad);
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+gradient computation for spliting 
+where we return the list of the splits
+for
+list(c_1, ...) = split(a) 
+we have
+dE/da = merge(dE/c_1, ...)
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradSplitList(XTensor * node)
+{
+    XLink &income = node->income;
+    XTensor * input = income.tails[0];
+
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SPLIT!");
+    CheckNTErrors(node->order == input->order + 1, "Wrong tensor orders!");
+
+    node->visitMark = NODE_DOING;
+}
+
+/*
+gradient computation for spliting. We return 
+the list of the splits : list(c_1, ...) = split(a).
+this method is called only when all nodes of spliting 
+have been processed. We do this in a post-processing
+manner because we can fuze multiple memory copy jobs 
+one time. This is good for system speed up. 
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradSplitListPost(XTensor * node)
+{
+    /* we compute the gradient for current node, rather than for
+       child node, i.e., we use the outgoing edge here */
+    XLink &outgo = node->outgo;
+    XList splits(outgo.tailNum);
+    int whereToSplit = -1;
+    int splitNum = 0;
+
+    for(int i = 0; i < outgo.tailNum; i++){
+        XTensor * parent = (XTensor*)outgo.tails[i];
+        XLink &income = parent->income;
+        if(income.typeID == SHAPE_SPLIT_LIST){
+            int w = income.GetParamInt(0);
+            int splitID = income.GetParamInt(1);
+            if(whereToSplit < 0)
+                whereToSplit = w;
+            splitNum++;
+
+            CheckNTErrors(whereToSplit == w, "Wrong dimension for spliting");
+            CheckNTErrors(income.tailNum == 1, "Something wrong with outgoing edge!");
+            CheckNTErrors(splitNum - 1 == splitID, "Wrong split id!");
+
+            splits.Add(parent);
+        }
+    }
+
+    /* we can simply merge the gradient tensor 
+       if the node is used in spliting only */
+    if(outgo.tailNum == splitNum){
+        _Merge(&splits, node->grad, whereToSplit + 1);
+    }
+
+    /* if the tensor is used as input to other nodes
+       somewhere else, we need another SUM for gradient 
+       accumulation */
+    else{
+        XTensor nodeGradTMP(node);
+
+        _Merge(&splits, &nodeGradTMP, whereToSplit + 1);
+        _Sum(node->grad, &nodeGradTMP, node->grad);
+    }
 }

 /* 
@@ -239,6 +368,40 @@ void XShapeGrad::GradUnsqueeze(XTensor * node)
    CheckNTErrors(output->unitNum = input->unitNum * dSize, "Wrong tensor size!");

    _ReduceSum(output->grad, input->grad, dim);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for transposing a tensor
+for
+c = Transpose(a)
+we have
+dE/da = Transpose(dE/dc)
+>> node - the node (c) for backward computation
+*/
+void XShapeGrad::GradTranspose(XTensor * node)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TRANSPOSE!");
+
+    XTensor * output = node;
+    XTensor * input = income.tails[0];
+    XTensor * b = NewTensor(input);
+    XNoder::MakeGrad(input);
+
+    int i = income.GetParamInt(0);
+    int j = income.GetParamInt(1);
+
+    CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
+    CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
+
+    _Transpose(output->grad, b, i, j);
+    _Sum(input->grad, b, input->grad);
+
+    node->visitMark = NODE_FINISHED;
+
+    delete b;
 }

 }
\ No newline at end of file
--- a/source/network/XBackwardShape.h
+++ b/source/network/XBackwardShape.h
@@ -40,18 +40,41 @@ public:
    static
    bool IsShapeOP(XTensor * node);

+    /* post processing of a node */
+    static
+    void PostProcessing(XTensor * node, int typeId);
+
 private:
-    /* gradient for merge: c = merge(a, b, ...) */
+    /* gradient computation for merge: c = merge(a, b, ...) */
    static
    void GradMerge(XTensor * node);

-    /* gradient for merging a list of tensors : c = merge(list(a, b, ...)) */
+    /* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
    static
    void GradMergeList(XTensor * node);

-    /* gradient for unsqueezing a tensor : c = unsqueeze(a) */
+    /* gradient computation for split: c = split(a) */
+    static
+    void GradSplit(XTensor * node);
+
+    /* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a) */
+    static
+    void GradSplitList(XTensor * node);
+
+    /* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a).
+       this method is called only when all nodes of spliting have been processed. We do this in a post-processing
+       manner because we can fuze multiple memory copy jobs one time. This is good for system speed up. */
+    static
+    void GradSplitListPost(XTensor * node);
+
+    /* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
    static
    void GradUnsqueeze(XTensor * node);
+
+    /* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
+    static
+    void GradTranspose(XTensor * node);
+    
 };

 }

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -46,6 +46,11 @@ unsigned int MakeNetID()
    return id;
 }

+void XNetClearAll()
+{
+    MUTEX_DELE(netMutex);
+}
+
 /* constructor */
 XNet::XNet()
 {
@@ -143,7 +148,7 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)

    /* back-propagation from output to input */
    for(int i = nodes.count - 1; i >= 0; i--){
-        XTensor * node = (XTensor*)nodes.Get(i);
+        XTensor * node = (XTensor*)nodes.Get(i);;
        if(node->visitMark == NODE_FINISHED)
            continue;

@@ -176,6 +181,10 @@ void XNet::BackwardNode(XTensor * node)
        return;

    if(!XNoder::IsLeaf(node)){
+        /* post processing for parent nodes */
+        BackwardNodePost(node);
+
+        /* process the current node */
        if(XMathGrad::IsMathOP(node))
            XMathGrad::MakeGrad(node);
        else if(XFuncGrad::IsFunc(node))
@@ -186,8 +195,24 @@ void XNet::BackwardNode(XTensor * node)
            ShowNTErrors("Wrong node type!");
        }
    }
+}
+
+/* 
+backward computation (in post processing) for a given node 
+>> node - the node whose parent nodes are not processed yet. So
+          we do the job at the child node.
+*/
+void XNet::BackwardNodePost(XTensor * node)
+{
+    bool isSplitList = false;
+    XLink &outgo = node->outgo;
+    for(int i = 0; i < outgo.tailNum; i++){
+        if(outgo.tails[i]->income.typeID == SHAPE_SPLIT_LIST)
+            isSplitList = true;
+    }

-    node->visitMark = NODE_FINISHED;
+    if(isSplitList)
+        XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST);
 }

 /* 
@@ -238,10 +263,11 @@ void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
    if(node == NULL)
        return;

+    //fprintf(stderr, "%d\n", node->id);
    if(node->visitMark == code + 1){
        ShowNTErrors("There is a circle in the network\n");
    }
-    else if(node->visitMark <= code || node->visitMark >= code + 2){
+    else if(node->visitMark <= code){
        node->visitMark = code + 1;
        XLink &income = node->income;
        for(int i = 0; i < income.tailNum; i++){

--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -73,6 +73,9 @@ struct XNet
    /* backward computation for a given node */
    void BackwardNode(XTensor * node);

+    /* backward computation (in post processing) for a given node */
+    void BackwardNodePost(XTensor * node);
+
    /* traverse the net and find the topological order by 
       depth-first search (Tarjan's algorithm) */
    void Traverse(XTensor &root);
@@ -92,6 +95,7 @@ struct XNet
 extern unsigned int netIDGlobal;
 extern MUTEX_HANDLE netMutex;
 extern unsigned int MakeNetID();
+extern void XNetClearAll();

 }


--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -33,7 +33,7 @@
 #include "../../tensor/function/FHeader.h"
 #include "../../network/XNet.h"

-namespace samplefnnlm
+namespace fnnlm
 {

 #define MAX_NAME_LENGTH 1024
@@ -57,7 +57,7 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model);
 void Init(FNNModel &model);
 void Check(FNNModel &model);
 void Copy(FNNModel &tgt, FNNModel &src);
-void Clear(FNNModel &model);
+void Clear(FNNModel &model, bool isNodeGrad);
 void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
 void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
 void Train(const char * train, bool isShuffled, FNNModel &model);
@@ -153,43 +153,80 @@ load arguments
 */
 void LoadArgs(int argc, const char ** argv, FNNModel &model)
 {
+    fprintf(stderr, "args:\n");
    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], "-train") && i + 1 < argc)
+        if(!strcmp(argv[i], "-train") && i + 1 < argc){
            strcpy(trainFN, argv[i + 1]);
-        if(!strcmp(argv[i], "-model") && i + 1 < argc)
+            fprintf(stderr, " -train=%s\n", argv[i + 1]);
+        }
+        if(!strcmp(argv[i], "-model") && i + 1 < argc){
            strcpy(modelFN, argv[i + 1]);
-        if(!strcmp(argv[i], "-test") && i + 1 < argc)
+            fprintf(stderr, " -model=%s\n", argv[i + 1]);
+        }
+        if(!strcmp(argv[i], "-test") && i + 1 < argc){
            strcpy(testFN, argv[i + 1]);
-        if(!strcmp(argv[i], "-output") && i + 1 < argc)
+            fprintf(stderr, " -test=%s\n", argv[i + 1]);
+        }
+        if(!strcmp(argv[i], "-output") && i + 1 < argc){
            strcpy(outputFN, argv[i + 1]);
-        if(!strcmp(argv[i], "-n") && i + 1 < argc)
+            fprintf(stderr, " -output=%s\n", argv[i + 1]);
+        }
+        if(!strcmp(argv[i], "-n") && i + 1 < argc){
            model.n = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-esize") && i + 1 < argc)
+            fprintf(stderr, " -n=%d\n", model.n);
+        }
+        if(!strcmp(argv[i], "-esize") && i + 1 < argc){
            model.eSize = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-vsize") && i + 1 < argc)
+            fprintf(stderr, " -esize=%d\n", model.eSize);
+        }
+        if(!strcmp(argv[i], "-vsize") && i + 1 < argc){
            model.vSize = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc)
+            fprintf(stderr, " -vsize=%d\n", model.vSize);
+        }
+        if(!strcmp(argv[i], "-hdepth") && i + 1 < argc){
            model.hDepth = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-hsize") && i + 1 < argc)
+            fprintf(stderr, " -hdepth=%d\n", model.hDepth);
+        }
+        if(!strcmp(argv[i], "-hsize") && i + 1 < argc){
            model.hSize = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-lrate") && i + 1 < argc)
+            fprintf(stderr, " -hsize=%d\n", model.hSize);
+        }
+        if(!strcmp(argv[i], "-lrate") && i + 1 < argc){
            learningRate = (float)atof(argv[i + 1]);
-        if(!strcmp(argv[i], "-nstep") && i + 1 < argc)
+            fprintf(stderr, " -lrate=%f\n", learningRate);
+        }
+        if(!strcmp(argv[i], "-nstep") && i + 1 < argc){
            nStep = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc)
+            fprintf(stderr, " -nstep=%d\n", nStep);
+        }
+        if(!strcmp(argv[i], "-nepoch") && i + 1 < argc){
            nEpoch = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-minmax") && i + 1 < argc)
+            fprintf(stderr, " -nepoch=%d\n", nEpoch);
+        }
+        if(!strcmp(argv[i], "-minmax") && i + 1 < argc){
            minmax = (float)fabs(atof(argv[i + 1]));
-        if(!strcmp(argv[i], "-batch") && i + 1 < argc)
+            fprintf(stderr, " -minmax=%f\n", minmax);
+        }
+        if(!strcmp(argv[i], "-batch") && i + 1 < argc){
            sentBatch = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc)
+            fprintf(stderr, " -batch=%d\n", sentBatch);
+        }
+        if(!strcmp(argv[i], "-wbatch") && i + 1 < argc){
            wordBatch = atoi(argv[i + 1]);
-        if(!strcmp(argv[i], "-shuffle"))
+            fprintf(stderr, " -wbatch=%d\n", wordBatch);
+        }
+        if(!strcmp(argv[i], "-shuffle")){
            shuffled = true;
-        if(!strcmp(argv[i], "-autodiff"))
+            fprintf(stderr, " -shuffle=true\n");
+        }
+        if(!strcmp(argv[i], "-autodiff")){
            autoDiff = true;
-        if(!strcmp(argv[i], "-dev") && i + 1 < argc)
+            fprintf(stderr, " -autodiff=true\n");
+        }
+        if(!strcmp(argv[i], "-dev") && i + 1 < argc){
            model.devID = atoi(argv[i + 1]);
+            fprintf(stderr, " -dev=%d\n", model.devID);
+        }
    }

    for(int i = 0; i < argc; i++){
@@ -203,6 +240,7 @@ void Check(FNNModel &model)
 {
    CheckErrors(model.n > 0 && model.n <= MAX_N_GRAM, "The LM order is out of range (use -n)!");
    CheckErrors(model.vSize > 0, "no vocabulary size found (use -vsize)!");
+    CheckErrors(model.eSize > 0, "no embedding size found (use -esize)!");
 }

 /* make a hard copy of the fnn model */
@@ -230,16 +268,37 @@ void Copy(FNNModel &tgt, FNNModel &src)
    }
 }

-/* reset model parameters */
-void Clear(FNNModel &model)
+/* 
+reset model parameters 
+>> model - the model whose parameter (gradient) is set to 0
+>> isNodeGrad - indicates whether the tensor node keeps the 
+                gradient information
+*/
+void Clear(FNNModel &model, bool isNodeGrad)
 {
-    model.embeddingW.SetZeroAll();
-    for(int i = 0; i < MAX_HIDDEN_NUM; i++){
-        model.hiddenW[i].SetZeroAll();
-        model.hiddenB[i].SetZeroAll();
+    if (isNodeGrad) {
+        if(model.embeddingW.grad != NULL)
+            model.embeddingW.grad->SetZeroAll();
+        for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
+            if(model.hiddenW[i].grad != NULL)
+                model.hiddenW[i].grad->SetZeroAll();
+            if(model.hiddenB[i].grad != NULL)
+                model.hiddenB[i].grad->SetZeroAll();
+        }
+        if(model.outputW.grad != NULL)
+            model.outputW.grad->SetZeroAll();
+        if(model.outputB.grad != NULL)
+            model.outputB.grad->SetZeroAll();
+    }
+    else {
+        model.embeddingW.SetZeroAll();
+        for (int i = 0; i < MAX_HIDDEN_NUM; i++) {
+            model.hiddenW[i].SetZeroAll();
+            model.hiddenB[i].SetZeroAll();
+        }
+        model.outputW.SetZeroAll();
+        model.outputB.SetZeroAll();
    }
-    model.outputW.SetZeroAll();
-    model.outputB.SetZeroAll();
 }

 /* 
@@ -401,7 +460,7 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                FNNNet net;

                /* gradident = 0 */
-                Clear(grad);
+                Clear(grad, false);

                /* forward computation */
                Forward(inputs, output, model, net);
@@ -413,6 +472,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                Update(model, grad, learningRate, false);
            }
            else{
+                /* gradient = 0 */
+                Clear(model, true);
+
                /* forward + backward process */
                ForwardAutoDiff(inputs, output, model);

@@ -492,21 +554,24 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
        gradList.Add(&grad.embeddingW);
    }
    else{
-        paraList.Add(model.outputW.grad);
-        paraList.Add(&model.outputB.grad);
+        gradList.Add(model.outputW.grad);
+        gradList.Add(model.outputB.grad);

        for (int i = 0; i < model.hDepth; i++) {
-            paraList.Add(&model.hiddenW[i].grad);
-            paraList.Add(&model.hiddenB[i].grad);
+            gradList.Add(model.hiddenW[i].grad);
+            gradList.Add(model.hiddenB[i].grad);
        }

-        paraList.Add(&model.embeddingW.grad);
+        gradList.Add(model.embeddingW.grad);
    }

    for (int i = 0; i < paraList.count; i++) {
        XTensor * para = (XTensor*)paraList.GetItem(i);
        XTensor * paraGrad = (XTensor*)gradList.GetItem(i);

+        //fprintf(stderr, "%d\n", i);
+        //paraGrad->Dump(stderr, "grad:", 10);
+
        /* the delta rule */
        _Sum(para, paraGrad, para, -epsilon);
    }
@@ -516,7 +581,7 @@ void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
 get prediction probabilites of the gold words
 >> output - output probabilities
 >> gold - gold standard
->> 
+>> wordPobs - probability of each word
 << return - probability of the batch
 */
 float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
@@ -568,8 +633,10 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
        if(pin <= 0){
            int len = (int)strlen(lineBuf);

-            if(lineBuf[len - 1] == '\r')
+            while(lineBuf[len - 1] == '\r' || lineBuf[len - 1] == '\n'){
                lineBuf[len - 1] = 0;
+                len--;
+            }

            len = (int)strlen(lineBuf);
            if(len == 0)
@@ -580,10 +647,11 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
        
            /* how many words are in the sentence */
            int wNum = 0;
+            int i = 0;

-            for(int i = pin; i < len; i++){
+            for(i = pin; i < len; i++){
                /* load word (id) seperated by space or tab */
-                if((lineBuf[i] == ' ' || lineBuf[i] == '\t' || i == len - 1) && wSize > 0){
+                if((lineBuf[i] == ' ' || lineBuf[i] == '\t') && wSize > 0){
                    lineBuf[i] = 0;
                    wordBuf[wNum++] = atoi(lineBuf + i - wSize);
                    wSize = 0;
@@ -592,6 +660,9 @@ int LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum)
                    wSize++;
            }

+            if(wSize > 0)
+                wordBuf[wNum++] = atoi(lineBuf + i - wSize);
+
            wordBufCount = wNum;
            lineNum++;
        }
@@ -911,7 +982,6 @@ forward process (with tensor connections)
 */
 void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
 {
-    int batchSize = inputs[0].GetDim(0);
    int n = model.n;
    int depth = model.hDepth;

@@ -935,15 +1005,13 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
    hidden = Merge(hidden, 2, 0);

    /* hidden layers */
-    for(int i = 0; i < depth; i++){
-        b = Unsqueeze(model.hiddenB[i], 1, batchSize);
-        hidden = MMul(hidden, model.hiddenW) + b;
-    }
-
-    b = Unsqueeze(model.outputB, 1, batchSize);
+    for(int i = 0; i < depth; i++)
+        hidden = MMul(hidden, model.hiddenW[i]) + model.hiddenB[i];

    /* output layer */
-    output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
+    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+
+    //XLink::ShowNetwork(stderr, &output);
 }

 /* 
@@ -1039,10 +1107,7 @@ void Test(const char * test, const char * result, FNNModel &model)

        /* the gold standard */
        XTensor gold;
-
-        /* prepare an empty network for building the fnn */
-        FNNNet net;
-
+        
        /* make the input tensor for position i */
        for (int i = 0; i < model.n - 1; i++)
            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
@@ -1050,8 +1115,16 @@ void Test(const char * test, const char * result, FNNModel &model)
        /* make the gold tensor */
        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);

-        /* forward computation */
-        Forward(inputs, output, model, net);
+        if (!autoDiff) {
+            /* prepare an empty network for building the fnn */
+            FNNNet net;
+
+            /* forward computation */
+            Forward(inputs, output, model, net);
+        }
+        else {
+            ForwardAutoDiff(inputs, output, model);
+        }

        /* prediction probabilities */
        XTensor probs;

--- a/source/sample/fnnlm/FNNLM.h
+++ b/source/sample/fnnlm/FNNLM.h
@@ -36,7 +36,7 @@

 using namespace nts;

-namespace samplefnnlm
+namespace fnnlm
 {

 #define _EXIT_(x)// exit(x)
@@ -126,7 +126,7 @@ struct FNNNet
    XTensor output;
 };

-/* entry of the program */
+/* entrance of the program */
 int FNNLMMain(int argc, const char ** argv);

 };

--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include <math.h>
+#include "T2TAttention.h"
+#include "T2TUtility.h"
+#include "T2TEmbedding.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+T2TAttention::T2TAttention()
+{
+    nhead = -1;
+    dk = -1;
+    dv = -1;
+    d  = -1;
+}
+
+/* deconstructor */
+T2TAttention::~T2TAttention()
+{
+}
+
+/* 
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+    
+    float minmax = 0;
+
+    LoadParamInt(argc, argv, "nhead", &nhead, 8);
+    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
+    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
+
+    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
+    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
+    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
+    
+    float scale = 1.0F;
+    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
+
+    wk.SetDataRand(-finfoutk, finfoutk);
+    wq.SetDataRand(-finfoutk, finfoutk);
+    wv.SetDataRand(-finfoutv, finfoutv);
+}
+
+/* 
+make the network 
+>> k - keys. It might be of size B * L * H
+       where B = batch size, L = sequence length, 
+       and H = vector size of each position
+>> q - queries
+>> v - values
+<< return - multi-attention result
+*/
+XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
+{
+    XTensor k2;
+    XTensor q2;
+    XTensor v2;
+
+    /* linear transofmration before self-attention */
+    k2 = MMul(k, wk);
+    q2 = MMul(q, wq);
+    v2 = MMul(v, wv);
+
+    XTensor kheads;
+    XTensor qheads;
+    XTensor vheads;
+
+    /* multi head */
+    kheads = Split(k2, k2.order - 1, nhead);
+    qheads = Split(q2, q2.order - 1, nhead);
+    vheads = Split(v2, v2.order - 1, nhead);
+
+    XTensor att;
+    XTensor scalar;
+
+    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
+    scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
+    att = BMMul(scalar, vheads);
+
+    /* concatenate the heads */
+    return Merge(att, att.order - 1);
+}
+
+}
--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TATTENTION_H__
+#define __T2TATTENTION_H__
+
+#include "../../network/XNet.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* 
+multi-head attention 
+y(Q, K, V) = cat(head_1, head_2, ..., head_n)
+where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
+      attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
+      d_k = dimension size of K
+*/
+class T2TAttention
+{
+public:
+    /* device id */
+    int devID;
+    
+    /* memory pool */
+    XMem * mem;
+    
+    /* head number */
+    int nhead;
+
+    /* transformation matrix for K */
+    XTensor wk;
+
+    /* transformation matrix for Q */
+    XTensor wq;
+
+    /* transformation matrix for V */
+    XTensor wv;
+
+    /* size of transformed Q and K */
+    int dk;
+
+    /* size of transformed V */
+    int dv;
+
+    /* size of input Q, K and V */
+    int d;
+
+public:
+    /* constructor */
+    T2TAttention();
+
+    /* de-constructor */
+    ~T2TAttention();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the network */
+    XTensor Make(XTensor &k, XTensor &q, XTensor &v);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TDECODER_H__
+#define __T2TDECODER_H__
+
+namespace transformer
+{
+
+class T2TDecoder
+{
+
+};
+
+class AttDecoder : T2TDecoder
+{
+public:
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
+ */
+
+#include <math.h>
+#include "T2TEmbedding.h"
+#include "T2TUtility.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+T2TEmbedder::T2TEmbedder()
+{
+    devID = -1;
+    mem = NULL;
+    vSize = -1;
+    maxLength = -1;
+}
+
+/* deconstructor */
+T2TEmbedder::~T2TEmbedder()
+{
+}
+
+/* 
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+    
+    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
+    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
+
+    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
+
+    w.SetDataRandn(0, 1.0F/(float)sqrt((float)eSize));
+
+    /* create the positional embedding matrix */
+    MakePosEmbedding(eSize, d, maxLength);
+}
+
+/* 
+make positional embeddings (of size eSize * length
+eSize - embedding size
+length - length of the sequenc
+*/
+void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
+{
+    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
+
+    float * data = new float[posEmbeddingBase.unitNum];
+
+    for(int pos = 0; pos < length; pos++){
+        float * dp = data + pos * eSize;
+        for(int k = 0; k < eSize; k++){
+            if(k % 2 == 0){
+                int i = k/2;
+                dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
+            }
+            else{
+                int i = (k - 1)/2;
+                dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
+            }
+        }
+    }
+
+    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
+
+    delete[] data;
+}
+
+/* 
+make the network 
+*/
+XTensor T2TEmbedder::Make(XTensor &input)
+{
+    CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
+    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
+    CheckNTErrors(input.dimSize[input.order - 2] < maxLength, "The sequence is too long!");
+    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
+
+    int dims[MAX_TENSOR_DIM_NUM];
+    memcpy(dims, input.dimSize, input.order * sizeof(int));
+    dims[input.order - 1] = eSize;
+
+    bool match = (posEmbedding.order == input.order);
+    if(match){
+        for(int i = 0; i < input.order; i++){
+            if(dims[i] != posEmbedding.GetDim(i))
+                match = false;
+        }
+    }
+
+    /* we make positional embeddings first */
+    if(!match){
+        InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
+        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
+
+        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
+        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
+
+        DelTensorBuf(posTMP);
+    }
+
+    XTensor wordEmbedding;
+
+    /* then we make word embeddings */
+    wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
+
+    /* we sum over the two embeddings */
+    return wordEmbedding +posEmbedding;
+}
+
+}
--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
+ */
+
+#ifndef __T2TEMBEDDING_H__
+#define __T2TEMBEDDING_H__
+
+#include "../../network/XNet.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+#define DEFAULT_EMBEDDING_SIZE 512
+
+/* 
+embedding (of word at position i):
+word embedding + positional embedding
+*/
+class T2TEmbedder
+{
+public:
+    /* device id */
+    int devID;
+    
+    /* memory pool */
+    XMem * mem;
+    
+    /* vocabulary size */
+    int vSize;
+
+    /* embedding size */
+    int eSize;
+
+    /* maximum length of the sequence */
+    int maxLength;
+
+    /* dimension size of the hidden layers in the t2t model */
+    int d;
+
+    /* word embedding matrix */
+    XTensor w;
+
+    /* predefined positional embeddings. It can speeds up 
+       the embedding processing by re-loading. */
+    XTensor posEmbeddingBase;
+
+    /* positional embeddings */
+    XTensor posEmbedding;
+
+public:
+    /* constructor */
+    T2TEmbedder();
+
+    /* de-constructor */
+    ~T2TEmbedder();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+
+    /* make positional embeddings */
+    void MakePosEmbedding(int eSize, int d, int length);
+
+    /* make the network */
+    XTensor Make(XTensor &input);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include <math.h>
+#include "T2TEncoder.h"
+#include "T2TLayerNormal.h"
+#include "T2TUtility.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+AttEncoder::AttEncoder()
+{
+}
+
+/* de-constructor */
+AttEncoder::~AttEncoder()
+{
+    delete[] attentions;
+    delete[] fnns;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
+}
+
+/* 
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void AttEncoder::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+    
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+
+    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+
+    /* embedding model */
+    embedder.InitModel(argc, argv, devID, mem);
+
+    attentions = new T2TAttention[nlayer];
+    fnns = new T2TFNN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
+
+    /* initialize the stacked layers */
+    for(int i = 0; i < nlayer; i++){
+        attentions[i].InitModel(argc, argv, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+    }
+}
+
+/* 
+make the encoding network
+>> input - the input tensor of the encoder
+<< return - the output tensor of the encoder
+*/
+XTensor AttEncoder::Make(XTensor &input)
+{
+    XTensor x;
+
+    x = embedder.Make(input);
+
+    for(int i = 0; i < nlayer; i++){
+        XTensor att;
+        XTensor ln;
+        XTensor fnn;
+        XTensor res;
+
+        /* self attention */
+        att = attentions[i].Make(x, x, x);
+
+        /* residual connection */
+        res = Sum(att, x);
+
+        /* TODO: dropout */
+
+        /* layer normalization */
+        x = attLayerNorms[i].Make(res);
+
+        /* fnn */
+        fnn = fnns[i].Make(x);
+
+        /* residual connection */
+        res = Sum(fnn, x);
+
+        /* TODO: dropout */
+
+        /* layer normalization */
+        x = fnnLayerNorms[i].Make(res);
+    }
+
+    return x;
+}
+
+}
--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TENCODER_H__
+#define __T2TENCODER_H__
+
+#include "T2TFNN.h"
+#include "T2TAttention.h"
+#include "T2TEmbedding.h"
+#include "T2TLayerNormal.h"
+#include "../../network/XNet.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* 
+base class of the encoder 
+*/
+class T2TEncoder
+{
+public:
+    virtual
+    XTensor Make(XTensor &input) = 0;
+};
+
+/* 
+the encoder based on RNN 
+*/
+class RNNEncoder : T2TEncoder
+{
+public:
+    XTensor Make(XTensor &input);
+};
+
+
+/* 
+the encoder based on self-attention 
+*/
+class AttEncoder : T2TEncoder
+{
+public:
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* layer number */
+    int nlayer;
+
+    /* hidden layer size of the FNN layer */
+    int hSize;
+
+    /* embedding size */
+    int eSize;
+
+    /* vocabulary size */
+    int vSize;
+
+    /* embedding of word at each position */
+    T2TEmbedder embedder;
+
+    /* FNN model of each layer */
+    T2TFNN * fnns;
+
+    /* attention model of each layer */
+    T2TAttention * attentions;
+
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
+
+    /* input tensor of the encoder */
+    XTensor * input;
+
+    /* output tensor of the encoder */
+    XTensor * output;
+    
+public:
+    /* constructor */
+    AttEncoder();
+
+    /* de-constructor */
+    ~AttEncoder();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the encoding network */
+    XTensor Make(XTensor &input);
+};
+
+
+}
+
+#endif
--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include <math.h>
+#include "T2TFNN.h"
+#include "T2TUtility.h"
+#include "T2TEmbedding.h"
+#include "../../tensor/core/CHeader.h"
+#include "../../tensor/function/FHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+T2TFNN::T2TFNN()
+{
+    inSize  = -1;
+    outSize = -1;
+    hSize   = -1;
+}
+
+/* deconstructor */
+T2TFNN::~T2TFNN()
+{
+}
+
+/* 
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+    
+    float minmax = 0;
+
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
+
+    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
+    InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
+
+    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
+    InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
+
+    float scale = 1.0F;
+    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+    float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
+    
+    w1.SetDataRand(-finfout1, finfout1);
+    b1.SetZeroAll();
+    w2.SetDataRand(-finfout2, finfout2);
+    b2.SetZeroAll();
+}
+
+/* 
+make the network 
+y = max(0, x * w1 + b1) * w2 + b2
+>> input - the input tensor
+>> return - the output tensor 
+*/
+XTensor T2TFNN::Make(XTensor &input)
+{
+    XTensor t1;
+
+    /* t1 = max(0, x * w1 + b1) */
+    t1 = Rectify(MMul(input, w1) + b1);
+
+    /* result = t1 * w2 + b2 */
+    return MMul(t1, w2) + b2;
+}
+
+
+}
--- a/source/sample/transformer/T2TFNN.h
+++ b/source/sample/transformer/T2TFNN.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TFNN_H__
+#define __T2TFNN_H__
+
+#include "../../tensor/XTensor.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
+class T2TFNN
+{
+public:
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* size of input vector */
+    int inSize;
+
+    /* size of output vector */
+    int outSize;
+
+    /* size of hidden layers */
+    int hSize;
+
+    /* matrix of transformation 1 */
+    XTensor w1;
+
+    /* bias of transformation 1 */
+    XTensor b1;
+
+    /* matrix of transformation 2 */
+    XTensor w2;
+
+    /* bias of transformation 2 */
+    XTensor b2;
+
+public:
+
+    /* constructor */
+    T2TFNN();
+
+    /* deconstructor */
+    ~T2TFNN();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the network */
+    XTensor Make(XTensor &input);
+
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include "T2TLayerNormal.h"
+#include "T2TUtility.h"
+#include "T2TEmbedding.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+    
+/* constructor */
+T2TLN::T2TLN()
+{
+    devID = -1;
+    mem   = NULL;
+}
+
+/* de-constructor */
+T2TLN::~T2TLN()
+{
+}
+
+/*
+initialize the model
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void T2TLN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+
+    int d = 0;
+    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
+
+    InitTensor2D(&w, d, d, X_FLOAT, devID, mem);
+    InitTensor1D(&b, d, X_FLOAT, devID, mem);
+
+    float scale = 1.0F;
+    float finfout = (float)sqrt(6.0F * scale / (d + d));
+
+    w.SetDataRand(-finfout, finfout);
+    b.SetZeroAll();
+}
+    
+/*
+make the network 
+for each layer representation x, we have
+y = 
+>> input - the input tensor
+>> return - layer normalization output
+*/
+XTensor T2TLN::Make(XTensor &input)
+{
+    XTensor &x = input;
+    XTensor xn;
+    XTensor mean;
+    XTensor variance;
+    XTensor standard;
+    XTensor meanFilled;
+    XTensor standardFilled;
+
+    /* \mu = (sum_i x_i)/m */
+    mean = ReduceMean(x, x.order - 1);
+
+    /* \sigma = (sum_i (x_i - \mu)^2)/m */
+    variance = ReduceVariance(x, x.order - 1, mean);
+
+    /* standard = sqrt(variance) */
+    standard = Power(variance, 0.5F);
+
+    /* unsqueeze mean and standard deviation to fit them into 
+       the same shape of x */
+    meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
+    standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
+
+    /* x' = (x - \mu)/standard */
+    xn = (x - meanFilled)/standardFilled ;
+
+    /* result = x' * w + b   */
+    return MMul(xn, w) + b;
+}
+
+}
--- a/source/sample/transformer/T2TLayerNormal.h
+++ b/source/sample/transformer/T2TLayerNormal.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TLAYERNORMAL_H__
+#define __T2TLAYERNORMAL_H__
+
+#include "../../network/XNet.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* layer normalization: y = norm(x) * w + b 
+   where norm(x) = (x - mean)/standardDeviation */
+class T2TLN
+{
+public:
+    /* device id */
+    int devID;
+    
+    /* memory pool */
+    XMem * mem;
+
+    /* the transformation matrix w */
+    XTensor w;
+
+    /* the bias term b */
+    XTensor b;
+    
+public:
+    /* constructor */
+    T2TLN();
+    
+    /* de-constructor */
+    ~T2TLN();
+    
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    
+    /* make the network */
+    XTensor Make(XTensor &input);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+
+#include "T2TModel.h"
+#include "T2TUtility.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+T2TModel::T2TModel()
+{
+    devID = -1;
+    mem = NULL;
+    isLM = false;
+    isMT = false;
+}
+
+/* de-constructor */
+T2TModel::~T2TModel()
+{
+    delete mem;
+}
+
+/* 
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+*/
+void T2TModel::InitModel(int argc, const char ** argv)
+{
+    bool useMem = false;
+
+    LoadParamInt(argc, argv, "dev", &devID, -1);
+    LoadParamBool(argc, argv, "mem", &useMem, useMem);
+    LoadParamBool(argc, argv, "lm", &isLM, true);
+    LoadParamBool(argc, argv, "mt", &isMT, false);
+
+    if(useMem){
+        delete mem;
+        mem = new XMem(devID);
+    }
+
+    encoder.InitModel(argc, argv, devID, mem);
+    outputLayer.InitModel(argc, argv, devID, mem);
+}
+
+/* 
+make the encoding network
+>> input - input tensor
+<< return - encoding result
+*/
+XTensor T2TModel::MakeEncoding(XTensor &input)
+{
+    return encoder.Make(input);
+}
+
+/* 
+make the entire network (with the output softmax layer) 
+>> input - input tensor
+>> output - output tensor (distribution)
+*/
+void T2TModel::Make(XTensor &input, XTensor &output)
+{
+    XTensor encoding;
+    
+    if(isLM){
+        encoding = MakeEncoding(input);
+        outputLayer.Make(encoding, output);
+    }
+    else{
+        ShowNTErrors("TODO!");
+    }
+}
+
+}
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TMODEL_H__
+#define __T2TMODEL_H__
+
+#include "T2TFNN.h"
+#include "T2TAttention.h"
+#include "T2TEncoder.h"
+#include "T2TDecoder.h"
+#include "T2TOutput.h"
+
+namespace transformer
+{
+
+class T2TModel
+{
+public:
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* the encoder */
+    AttEncoder encoder;
+
+    /* the decoder */
+    AttDecoder decoder;
+
+    /* output layer */
+    T2TOutput outputLayer;
+
+    /* indicates whether the model is running for language modeling */
+    bool isLM;
+
+    /* indicates whether the model is running for machine translation */
+    bool isMT;
+
+public:
+    /* constructor */
+    T2TModel();
+
+    /* de-constructor */
+    ~T2TModel();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv);
+
+    /* make the encoding network */
+    XTensor MakeEncoding(XTensor &input);
+
+    /* make the entire network (with the output softmax layer) */
+    void Make(XTensor &input, XTensor &output);
+};
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include <math.h>
+#include "T2TOutput.h"
+#include "T2TUtility.h"
+#include "T2TEmbedding.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+/* constructor */
+T2TOutput::T2TOutput()
+{
+    devID = -1;
+    mem = NULL;
+    vSize = -1;
+    inSize = -1;
+    hSize = -1;
+}
+
+/* de-constructor */
+T2TOutput::~T2TOutput()
+{
+}
+
+/*
+initialize the model 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+>> myDevID - device id
+>> myMem - the memory pool
+*/
+void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
+{
+    devID = myDevID;
+    mem = myMem;
+
+    float minmax = 0;
+
+    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
+
+    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
+    
+    float scale = 1.0F;
+    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
+    w.SetDataRand(-finfout, finfout);
+}
+
+/* 
+make the network 
+y = softmax(x * w)
+>> input - input tensor
+<< return - output tensor 
+*/
+XTensor T2TOutput::Make(XTensor &input)
+{
+    XTensor &x = input;
+
+    return LogSoftmax(MMul(x, w), -1);
+}
+
+/* 
+make the network (redefined output tensor) 
+>> input - input tensor
+>> output - output tensor 
+*/
+void T2TOutput::Make(XTensor &input, XTensor &output)
+{
+    XTensor &x = input;
+
+    output = LogSoftmax(MMul(x, w), -1);
+}
+
+}
--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TOUTPUT_H__
+#define __T2TOUTPUT_H__
+
+#include "../../tensor/function/FHeader.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* output layer */
+class T2TOutput
+{
+public:
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* vocabulary size */
+    int vSize;
+
+    /* input vector size */
+    int inSize;
+
+    /* vector size of the linear transformation */
+    int hSize;
+
+    /* transformation matrix */
+    XTensor w;
+
+public:
+    /* constructor */
+    T2TOutput();
+
+    /* de-constructor */
+    ~T2TOutput();
+
+    /* initialize the model */
+    void InitModel(int argc, const char ** argv, int myDevID = -1, XMem * myMem = NULL);
+
+    /* make the network */
+    XTensor Make(XTensor &input);
+
+    /* make the network (redefined output tensor) */
+    void Make(XTensor &input, XTensor &output);
+};
+
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
+ */
+
+#include <math.h>
+#include "T2TTrainer.h"
+#include "T2TUtility.h"
+#include "../../tensor/XUtility.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* constructor */
+T2TTrainer::T2TTrainer()
+{
+    devID = -1;
+    mem = NULL;
+    seqLen = NULL;
+    nseqBuf = 0;
+    nextSeq = -1;
+}
+
+/* de-constructor */
+T2TTrainer::~T2TTrainer()
+{
+    delete[] buf;
+    delete[] seqLen;
+    delete[] seqOffset;
+}
+
+/* 
+initialization 
+>> argc - number of arguments
+>> argv - list of pointers to the arguments
+*/
+void T2TTrainer::Init(int argc, const char ** argv)
+{
+    LoadParamInt(argc, argv, "dev", &devID, -1);
+    LoadParamFloat(argc, argv, "lrate", &lrate, 0.001F);
+    LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
+    LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1);
+    LoadParamInt(argc, argv, "nepoch", &nepoch, 1);
+    LoadParamInt(argc, argv, "nstep", &nstep, 1);
+    LoadParamInt(argc, argv, "d", &d, 512);
+    LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
+    LoadParamInt(argc, argv, "vsize", &vSize, 1);
+    LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
+    LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
+
+    buf = new int[bufSize];
+    seqLen = new int[bufSize];
+    seqOffset = new int[bufSize];
+}
+
+/* 
+train the model
+>> fn - training data file
+>> model - model to train
+*/
+void T2TTrainer::Train(const char * fn, T2TModel * model)
+{
+    int epoch = 0;
+    int step = 0;
+    int wc = 0;
+    int wordCount = 0;
+    int wordCountTotal = 0;
+    bool isEnd = false;
+    float loss = 0;
+    float lr = 0;
+
+    XNet net;
+    
+    double startT = GetClockSec();
+    
+    for(epoch = 0; epoch < nepoch; epoch++){
+        
+        FILE * file = fopen(fn, "rb");
+        CheckNTErrors(file, "cannot open training file!");
+        
+        wordCount = 0;
+        
+        /* batch of input sequences */
+        XTensor batch;
+        
+        while(LoadBatch(file, &batch, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc)){
+            
+            /* output probabilities */
+            XTensor output;
+            
+            /* make the network */
+            model->Make(batch, output);
+
+            /* back-propagation for obtaining gradients */
+            net.Backward(output, batch, CROSSENTROPY);
+            
+            /* learning rate */
+            lr = (1 / (float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
+            //lr = 0.00005F;
+            
+            /* update the parameters */
+            Update(model, lr);
+            
+            /* get probabilities */
+            float prob = GetProb(&output, &batch, NULL);
+            
+            loss += -prob;
+            wordCount += wc;
+            wordCountTotal += wc;
+            
+            if(++step >= nstep){
+                isEnd = true;
+                break;
+            }
+            
+            if (step % 1 == 0) {
+                double elapsed = GetClockSec() - startT;
+                XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
+                        lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
+            }
+        }
+        
+        fclose(file);
+
+        if (isEnd)
+            break;
+    }
+    
+    double elapsed = GetClockSec() - startT;
+    
+    XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
+            lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
+    XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
+            elapsed, step, epoch);
+}
+
+char line[MAX_SEQUENCE_LENGTH];
+
+/* 
+load data to buffer 
+>> file - where to load data
+*/
+int T2TTrainer::LoadBuf(FILE * file)
+{
+    int lineCount = 0;
+    int seqCount = 0;
+    int wordCount = 0;
+    while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
+        int len = (int)strlen(line);
+
+        while(line[len - 1] == '\r' || line[len - 1] == '\n'){
+            line[len - 1] = 0;
+            len--;
+        }
+
+        len = (int)strlen(line);
+        if(len == 0)
+            continue;
+        
+        /* how many characters are in a word */
+        int wSize = 0;
+        
+        /* how many words are in the sentence */
+        int wNum = 0;
+        int wNumLocal = 0;
+        int i = 0;
+
+        for(i = 0; i < len; i++){
+            /* load word (id) seperated by space or tab */
+            if((line[i] == ' ' || line[i] == '\t') && wSize > 0){
+                line[i] = 0;
+
+                if(wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|'){
+                    seqLen[seqCount] = wNumLocal;
+                    seqOffset[seqCount] = wordCount + wNum - wNumLocal;
+                    seqCount++;
+                    wNumLocal = 0;
+                }
+                else{
+                    buf[wordCount + wNum++] = atoi(line + i - wSize);
+                    wNumLocal++;
+                }
+
+                wSize = 0;
+            }
+            else
+                wSize++;
+        }
+
+        if(wSize > 0){
+            buf[wordCount + wNum++] = atoi(line + i - wSize);
+            wNumLocal++;
+        }
+
+        seqLen[seqCount] = wNumLocal;
+        seqOffset[seqCount] = wordCount + wNum - wNumLocal;
+        seqCount++;
+
+        wordCount += wNum;
+        lineCount++;
+
+        if(wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
+            break;
+    }
+
+    nseqBuf = seqCount;
+    nextSeq = 0;
+
+    return lineCount;
+}
+
+/* 
+load a batch of sequences 
+>> file - the handle to the data file
+>> batch - the batch
+>> step - the step we go over when move to the next sequence
+>> vs - vocabulary size
+>> sBatch - batch size of sequences
+>> wBatch - batch size of words
+>> isSorted - indicates whether the sequences are sorted by length
+>> wCount - word count
+*/
+int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount)
+{
+    if(nextSeq < 0 || nextSeq >= nseqBuf)
+        LoadBuf(file);
+
+    int seq = MAX(nextSeq, 0);
+    int wc = 0;
+    int wn = 0;
+    int sc = 0;
+    int max = 0;
+    while(seq + sc < nseqBuf){
+        wn = seqLen[seq + sc];
+        wc += wn;
+        sc += 1;
+
+        if(max < wn)
+            max = wn;
+
+        if(sc >= sBatch && wc >= wBatch)
+            break;
+    }
+
+    wCount = 0;
+    nextSeq = seq + sc;
+
+    if(sc > 0){
+        int dims[MAX_TENSOR_DIM_NUM];
+        dims[0] = sc;
+        dims[1] = max;
+        dims[2] = vs;
+
+        if(batch->order != 3 || batch->GetDim(0) != dims[0] || 
+           batch->GetDim(1) != dims[1] || batch->GetDim(2) != dims[2]){
+               InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
+        }
+
+        batch->SetZeroAll();
+
+        /* this might be slow on GPUs :( */
+        for(int s = seq; s < seq + sc; s++){
+            for(int w = 0; w < seqLen[s]; w++){
+                batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+                wCount++;
+            }
+        }
+    }
+
+    return sc;
+}
+    
+/*
+get word probabilities for a batch of sequences
+>> output - word distribution for each position
+>> gold - gold standard
+>> wordProbs - word probability for gold prediction
+*/
+float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
+{
+    XTensor probs;
+    InitTensor(&probs, output);
+    
+    /* probs[i,j] = output[i,j] * gold[i,j] */
+    _Multiply(output, gold, &probs);
+    
+    /* probability of each word */
+    XTensor wprobs;
+    InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID, output->mem);
+    
+    int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
+    probs.Reshape(2, dims);
+    _ReduceSum(&probs, &wprobs, 1);
+    
+    if(wordProbs != NULL)
+        _CopyValues(&wprobs, wordProbs);
+    
+    /* reshape the tensor to fit it into the reduce procedure
+     TODO: XTensor supports scalars */
+    dims[0] = 1;
+    dims[1] = probs.unitNum;
+    probs.Reshape(2, dims);
+    
+    /* probability for the batch */
+    XTensor result;
+    InitTensor1D(&result, 1, X_FLOAT, output->devID, output->mem);
+    _ReduceSum(&probs, &result, 1);
+    
+    return result.Get1D(0);
+}
+
+/* 
+update the model by delta rule
+\theta_new = \theta - \lrate * grad
+where
+\lrate = d^-0.5 * min(stepNum^-0.5, stepNum * warmupStepNum^-1.5)
+>> model - the t2t model
+>> lr - learning rate
+*/
+void T2TTrainer::Update(T2TModel * model, const float lr)
+{
+    XList ws(100);
+
+    ws.Add(&model->outputLayer.w);
+    
+    for(int i = 0; i < model->encoder.nlayer; i++){
+        ws.Add(&model->encoder.fnns[i].w1);
+        ws.Add(&model->encoder.fnns[i].b1);
+        ws.Add(&model->encoder.fnns[i].w2);
+        ws.Add(&model->encoder.fnns[i].b2);
+        ws.Add(&model->encoder.attentions[i].wk);
+        ws.Add(&model->encoder.attentions[i].wq);
+        ws.Add(&model->encoder.attentions[i].wv);
+        ws.Add(&model->encoder.fnnLayerNorms[i].w);
+        ws.Add(&model->encoder.fnnLayerNorms[i].b);
+        ws.Add(&model->encoder.attLayerNorms[i].w);
+        ws.Add(&model->encoder.attLayerNorms[i].b);
+    }
+
+    ws.Add(&model->encoder.embedder.w);
+
+    for(int i = 0; i < ws.count; i++){
+        XTensor * para = (XTensor*)ws.Get(i);
+        XTensor * paraGrad = para->grad;
+
+        if (para == NULL || paraGrad == NULL)
+            continue;
+
+        CheckNTErrors(para != NULL, "NULL parameter tensor!");
+        CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
+
+        /*
+        DTYPE * d = new DTYPE[para->unitNum * para->unitSize];
+        DTYPE * g = new DTYPE[para->unitNum * para->unitSize];
+
+        XMemCopy(d, -1, para->data, para->devID, para->unitNum * para->unitSize);
+        XMemCopy(g, -1, paraGrad->data, paraGrad->devID, para->unitNum * para->unitSize);
+
+        for (int i = 0; i < para->unitNum; i++) {
+            if (IsNAN(d[i]) || IsINF(d[i])) {
+                int nnn = 0;
+            }
+            if (IsNAN(g[i]) || IsINF(g[i])) {
+                int nnn = 0;
+            }
+        }
+
+        delete[] d;
+        delete[] g;
+        */
+
+        /* the delta rule */
+        _Sum(para, paraGrad, para, -lr);
+
+        /* clear gradient */
+        paraGrad->SetZeroAll();
+    }
+}
+
+}
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
+ */
+
+#ifndef __T2TTRAINER_H__
+#define __T2TTRAINER_H__
+
+#include "T2TModel.h"
+
+#include "../../tensor/function/FHeader.h"
+
+#define MAX_SEQUENCE_LENGTH 1024 * 4
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* trainer of the T2T model */
+class T2TTrainer
+{
+public:
+    /* device id */
+    int devID;
+
+    /* memory pool */
+    XMem * mem;
+
+    /* buffer for loading words */
+    int * buf;
+
+    /* buffer size */
+    int bufSize;
+
+    /* length of each sequence */
+    int * seqLen;
+
+    /* offset of the first word for each sequence */
+    int * seqOffset;
+
+    /* number of sequences in the buffer */
+    int nseqBuf;
+
+    /* offset for next sequence in the buffer */
+    int nextSeq;
+    
+    /* indicates whether the sequence is sorted by length */
+    bool isLenSorted;
+    
+    /* dimension size of each inner layer */
+    int d;
+    
+    /* step number of warm-up for training */
+    int nwarmup;
+
+    /* vocabulary size of the source side */
+    int vSize;
+
+    /* learning rate */
+    float lrate;
+
+    /* sentence batch size */
+    int sBatchSize;
+
+    /* word batch size */
+    int wBatchSize;
+
+    /* training epoch number */
+    int nepoch;
+
+    /* traing step number */
+    int nstep;
+
+public:
+    /* constructor */
+    T2TTrainer();
+
+    /* de-constructor */
+    ~T2TTrainer();
+
+    /* initialize the trainer */
+    void Init(int argc, const char ** argv);
+
+    /* train the model */
+    void Train(const char * fn, T2TModel * model);
+
+    /* load data to buffer */
+    int LoadBuf(FILE * file);
+
+    /* load a batch of sequences */
+    int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
+    
+    /* get word probabilities for a batch of sequences */
+    float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
+
+    /* update the model by delta rule */
+    void Update(T2TModel * model, const float lr);
+};
+
+
+}
+
+#endif
--- a/source/sample/transformer/T2TUtility.cpp
+++ b/source/sample/transformer/T2TUtility.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace transformer
+{
+
+FILE * tmpFILE;
+
+void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
+{
+    char vname[128];
+    vname[0] = '-';
+    strcpy(vname + 1, name);
+    bool hit = false;
+    for(int i = 0; i < argc; i++){
+        if(!strcmp(argv[i], vname) && i + 1 < argc){
+            strcpy(p, argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            hit = true;
+        }
+    }
+    if(!hit)
+        strcpy(p, defaultP);
+}
+
+void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP)
+{
+    char vname[128];
+    vname[0] = '-';
+    strcpy(vname + 1, name);
+    bool hit = false;
+    for(int i = 0; i < argc; i++){
+        if(!strcmp(argv[i], vname) && i + 1 < argc){
+            *(int*)p = atoi(argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            hit = true;
+        }
+    }
+    if(!hit)
+        *p = defaultP;
+}
+
+void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP)
+{
+    char vname[128];
+    vname[0] = '-';
+    strcpy(vname + 1, name);
+    bool hit = false;
+    for(int i = 0; i < argc; i++){
+        if(!strcmp(argv[i], vname)){
+            *(bool*)p = true;
+            //fprintf(stderr, " %s=%s\n", name, "true");
+            hit = true;
+        }
+    }
+    if(!hit)
+        *p = defaultP;
+}
+
+void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP)
+{
+    char vname[128];
+    vname[0] = '-';
+    strcpy(vname + 1, name);
+    bool hit = false;
+    for(int i = 0; i < argc; i++){
+        if(!strcmp(argv[i], vname) && i + 1 < argc){
+            *p = (float)atof(argv[i + 1]);
+            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
+            hit = true;
+        }
+    }
+    if(!hit)
+        *p = defaultP;
+}
+
+void ShowParams(int argc, const char ** argv)
+{
+    fprintf(stderr, "args:\n");
+    for(int i = 0; i < argc; i++){
+        if(argv[i][0] == '-'){
+            if(i + 1 < argc && argv[i + 1][0] != '-')
+                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
+            else
+                fprintf(stderr, " %s=yes\n", argv[i]);
+        }
+    }
+    fprintf(stderr, "\n");
+}
+
+}
--- a/source/sample/transformer/T2TUtility.h
+++ b/source/sample/transformer/T2TUtility.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#ifndef __T2TUTILITY_H__
+#define __T2TUTILITY_H__
+
+#include <stdio.h>
+
+namespace transformer
+{
+
+extern FILE * tmpFILE;
+
+/* load arguments */
+void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP);
+void LoadParamInt(int argc, const char ** argv, const char * name, int * p, int defaultP);
+void LoadParamBool(int argc, const char ** argv, const char * name, bool * p, bool defaultP);
+void LoadParamFloat(int argc, const char ** argv, const char * name, float * p, float defaultP);
+
+/* show arguments */
+void ShowParams(int argc, const char ** argv);
+
+}
+
+#endif
--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ */
+
+#include "Transformer.h"
+#include "T2TModel.h"
+#include "T2TUtility.h"
+#include "T2TTrainer.h"
+#include "../../tensor/XDevice.h"
+
+namespace transformer
+{
+
+int TransformerMain(int argc, const char ** argv)
+{
+    if(argc == 0)
+        return 1;
+
+    tmpFILE = fopen("tmp.txt", "wb");
+
+    ShowParams(argc, argv);
+
+    char * trainFN = new char[MAX_LINE_LENGTH];
+
+    LoadParamString(argc, argv, "train", trainFN, "");
+
+    T2TModel model;
+
+    model.InitModel(argc, argv);
+
+    if(strcmp(trainFN, "")){
+        T2TTrainer trainer;
+        trainer.Init(argc, argv);
+        trainer.Train(trainFN, &model);
+    }
+
+    delete[] trainFN;
+
+    fclose(tmpFILE);
+
+    return 0;
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/Transformer.h
+++ b/source/sample/transformer/Transformer.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ *
+ * An impelementation of the transformer system. See more details 
+ * about FNNLM in 
+ * "Attention Is All You Need" by Vaswani et al.
+ * https://arxiv.org/pdf/1706.03762.pdf
+ *
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * I start writing the code related to NMT - a long time since my last coding 
+ * work on MT
+ */
+
+#ifndef __TRANSFORMER_H__
+#define __TRANSFORMER_H__
+
+#include "../../tensor/XGlobal.h"
+#include "../../tensor/XTensor.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace transformer
+{
+
+/* entrance of the program */
+int TransformerMain(int argc, const char ** argv);
+
+}
+
+#endif
\ No newline at end of file
--- a/source/sample/transformer/transformer.lnk
+++ b/source/sample/transformer/transformer.lnk
--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
@@ -29,6 +29,7 @@
 #include "XTensor.h"
 #include "XDevice.h"
 #include "./test/Test.h"
+#include "./core/CHeader.h"

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>  
@@ -37,6 +38,7 @@
 using namespace nts;

 void SmallTest();
+void TransposeTest();

 int main( int argc, const char ** argv )
 {
@@ -92,3 +94,35 @@ void SmallTest()
    c.Dump(stderr, "c:");
    d.Dump(stderr, "d:");
 }
+
+void TransposeTest()
+{
+    XTensor a;
+    XTensor b;
+
+    int I = 2;
+    int J = 3;
+
+    InitTensor4D(&a, 2, 3, 4, 5);
+
+    int * dims = new int[a.order];
+    memcpy(dims, a.dimSize, sizeof(int) * a.order);
+    dims[I] = a.dimSize[J];
+    dims[J] = a.dimSize[I];
+
+    InitTensor(&b, 4, dims);
+
+    a.SetZeroAll();
+    b.SetZeroAll();
+
+    float * data = new float[a.unitNum];
+    for(int i = 0; i < a.unitNum; i++)
+        data[i] = (float)i;
+
+    a.SetData(data, a.unitNum, 0);
+
+    _Transpose(&a, &b, I, J);
+    b.Dump(stderr, "b:");
+
+    delete[] data;
+}
--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
@@ -40,6 +40,7 @@ XDevManager GDevs;
 /* constructor */
 XDevice::XDevice()
 {
+    stream = NULL;
    Clear();

 #ifdef USE_CUDA
@@ -55,6 +56,8 @@ XDevice::~XDevice()
    MUTEX_DELE(cublasMutex);
    if(isHandleReady)
        cublasDestroy(cublasHandle);
+    if(stream != NULL)
+        delete stream;
 #endif
 }

@@ -118,6 +121,8 @@ void XDevice::Init(int myDevID)
        }
        else
            sprintf(name2, "GPU-%d %s", devID, name);
+
+        stream = new XStream(0, devID);
 #endif
    }

@@ -161,6 +166,14 @@ cublasHandle_t * XDevice::GetCublasHandle()
    return &cublasHandle;
 }

+/* get the stream of cuda */
+cudaStream_t * XDevice::GetCudaStream()
+{
+    CheckNTErrors(stream != NULL, "the stream is not initialized!");
+
+    return &stream->stream;
+}
+
 #endif // USE_CUDA

 /* switch to a device */
@@ -311,11 +324,19 @@ void XDevManager::Clear()
 /* get the handle of GPU */
 cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 {
-    CheckNTErrors((devID < nGPU), "index of GPU is out of range.");
+    CheckNTErrors(devID < nGPU, "index of GPU is out of range.");

    return GPUs[devID].GetCublasHandle();
 }

+/* get the stream of cuda */
+cudaStream_t * XDevManager::GetCudaStream(const int devID)
+{
+    CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
+
+    return GPUs[devID].GetCudaStream();
+}
+
 #endif

 /* 
@@ -384,13 +405,10 @@ int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int 
    memset(gridSize, 0, sizeof(int) * 3);
    memset(blockSize, 0, sizeof(int) * 3);

-    if(n <= 0 || m <= 0 || devID >= nGPU)
+    if(n <= 0 || m <= 0)
        return 1;

-    if(devID < 0){
-        XPRINT(0, stderr, "WARNING! You are calling the grid and block size computation function on a CPU!");
-        return 0;
-    }
+    CheckNTErrors(devID >= 0 && devID < nGPU, "Invalid GPU device id!");

 #ifdef USE_CUDA


--- a/source/tensor/XDevice.h
+++ b/source/tensor/XDevice.h
@@ -25,6 +25,7 @@
 #define __XDEVICE_H__

 #include "XThread.h"
+#include "XStream.h"

 #ifdef USE_CUDA

@@ -92,6 +93,9 @@ public:

    /* specify whether Unified Virtual Address Space (UVA) is supported */
    bool isUVASupported;
+
+    /* default stream for the device */
+    XStream * stream;
    
 #ifdef USE_CUDA
    /* mutex for handle (GPU cublas) */
@@ -121,6 +125,9 @@ public:
 #ifdef USE_CUDA
    /* get cublas handle */
    cublasHandle_t * GetCublasHandle();
+
+    /* get the stream of cuda */
+    cudaStream_t * GetCudaStream();
 #endif

    /* switch to a device */
@@ -178,6 +185,9 @@ public:
 #ifdef USE_CUDA
    /* get the handle of GPU */
    cublasHandle_t * GetCudaHandle(const int devID);
+
+    /* get the stream of cuda */
+    cudaStream_t * GetCudaStream(const int devID);
 #endif

    /* get grid and block sizes that max potential */

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -167,7 +167,9 @@ void XLink::SetType(int id)
    type[0] = 0;
    strcpy(type, GetOPName(id));
    typeID = id;
-    CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
+    if(id != 0){
+        CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
+    }
 }

 /* 
@@ -515,7 +517,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
        tails.Add(tail);
    }

-    MakeLink(&tails, target, reference->id);
+    MakeLink(&tails, target, reference->income.typeID);

    int paraNum = reference->income.paramNum;
    target->income.paramNum = paraNum;

--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
@@ -208,22 +208,16 @@ void XList::Insert(int pos, void * item)
 /* get the item at position i */
 void * XList::GetItem(int i) const
 {
-    if( i >= 0 && i < count )
-        return items[i];
-    else
-        return NULL;
+    CheckNTErrors(i >= 0 && i < count, "Index of a list item is out of scope!");
+    return items[i];
 }

 /* get the integer-typed item at position i */
 int XList::GetItemInt(int i)
 {
    CheckNTErrors(isIntList, "An int list is required!");
-
-    if( i >= 0 && i < count ){
-        return *(int*)(items[i]);
-    }
-    else
-        return 0;
+    CheckNTErrors(i >= 0 && i < count, "Index of a list item is out of scope!");
+    return *(int*)(items[i]);
 }

 /* set the item at position i */

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -181,7 +181,10 @@ void XMem::Free(int myDevID, void * mem)
    else{
 #ifdef USE_CUDA
        SetDevice(myDevID);
-        CheckNTErrors(cudaFree((char*)mem) == cudaSuccess, "Cannot free the memory.");
+        cudaError_t error = cudaFree((char*)mem);
+        if(error != cudaSuccess){
+            ShowNTErrors("Cannot free the memory.");
+        }
 #else
        ShowNTErrors("Please specify USE_CUDA for compiling this program.");
 #endif

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -29,6 +29,22 @@ const char * GetOPName(int type)
    if ((type & MATH_BASE) != 0){
        if (type == MATH_ABSOLUTE)
            return "M_ABSOLUTE";
+        else if (type == MATH_EXP)
+            return "M_EXP";
+        else if (type == MATH_LOG)
+            return "M_LOG";
+        else if (type == MATH_SIN)
+            return "M_SIN";
+        else if (type == MATH_COS)
+            return "M_COS";
+        else if (type == MATH_TAN)
+            return "M_TAN";
+        else if (type == MATH_ROUND)
+            return "M_ROUND";
+        else if (type == MATH_CLIP)
+            return "M_CLIP";
+        else if (type == MATH_DIV)
+            return "M_DIV";
        else if (type == MATH_MATRIXMUL)
            return "M_MATRIXMUL";
        else if (type == MATH_MATRIXMULBATCHED)
@@ -37,18 +53,20 @@ const char * GetOPName(int type)
            return "M_MULTIPLY";
        else if (type == MATH_NEGATE)
            return "M_NEGATE";
-        else if (type == MATH_SIGN)
-            return "M_SIGN";
-        else if (type == MATH_SUM)
-            return "M_SUM";
-        else if (type == MATH_LOG)
-            return "M_LOG";
        else if (type == MATH_NORMALIZE)
            return "M_NORMALIZE";
        else if (type == MATH_POWER)
            return "M_POWER";
        else if (type == MATH_SCALEANDSHIFT)
            return "M_SCALEANDSHIFT";
+        else if (type == MATH_SIGN)
+            return "M_SIGN";
+        else if (type == MATH_SUM)
+            return "M_SUM";
+        else if (type == MATH_SUB)
+            return "M_SUB";
+        else if (type == MATH_SUMDIM)
+            return "M_SUMDIM";
        else if (type == REDUCE_REDUCEMAX)
            return "R_REDUCEMAX";
        else if (type == REDUCE_REDUCEMEAN)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -30,20 +30,30 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* math operations */
 #define MATH_BASE               0x00001000
+
 #define MATH_ABSOLUTE           MATH_BASE + 1
-#define MATH_MATRIXMUL          MATH_ABSOLUTE + 1
+#define MATH_EXP                MATH_ABSOLUTE + 1
+#define MATH_LOG                MATH_EXP + 1
+#define MATH_SIN                MATH_LOG + 1
+#define MATH_COS                MATH_SIN + 1
+#define MATH_TAN                MATH_COS + 1
+#define MATH_ROUND              MATH_TAN + 1
+
+#define MATH_CLIP               MATH_ROUND + 1
+#define MATH_DIV                MATH_CLIP + 1
+#define MATH_MATRIXMUL          MATH_DIV + 1
 #define MATH_MATRIXMULBATCHED   MATH_MATRIXMUL + 1
 #define MATH_MULTIPLY           MATH_MATRIXMULBATCHED + 1
 #define MATH_NEGATE             MATH_MULTIPLY + 1
-#define MATH_SIGN               MATH_NEGATE + 1
-#define MATH_SUM                MATH_SIGN + 1
-
-#define MATH_LOG                MATH_SUM + 1
-#define MATH_NORMALIZE          MATH_LOG + 1
+#define MATH_NORMALIZE          MATH_NEGATE + 1
 #define MATH_POWER              MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT      MATH_POWER + 1
+#define MATH_SIGN               MATH_SCALEANDSHIFT + 1
+#define MATH_SUM                MATH_SIGN + 1
+#define MATH_SUB                MATH_SUM + 1
+#define MATH_SUMDIM             MATH_SUB + 1

-#define REDUCE                  MATH_SCALEANDSHIFT + 1
+#define REDUCE                  MATH_SUMDIM + 1
 #define REDUCE_REDUCEMAX        REDUCE + 1
 #define REDUCE_REDUCEMEAN       REDUCE_REDUCEMAX + 1
 #define REDUCE_REDUCESUM        REDUCE_REDUCEMEAN + 1

--- a/source/tensor/XStream.cpp
+++ b/source/tensor/XStream.cpp
@@ -84,7 +84,7 @@ void XStream::Create(int priority, int myDevID)
    XDevice::SetGPUDevice(myDevID);
    //cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
    CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess), 
-                        "cannot create the cuda stream!");
+                  "cannot create the cuda stream!");
    XDevice::SetGPUDevice(backupDevID);
 #endif
    devID = myDevID;

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -42,6 +42,8 @@
 #include "core/movement/CopyValues.h"
 #include "core/arithmetic/Sum.h"
 #include "core/arithmetic/Multiply.h"
+#include "core/arithmetic/Sub.h"
+#include "core/arithmetic/Div.h"
 #include "core/math/ScaleAndShift.h"

 #ifdef USE_CUDA
@@ -354,6 +356,18 @@ XTensor XTensor::operator* (const XTensor& tensor)
    return Multiply(*this, tensor);
 }

+/* overloading of the minus-sign */
+XTensor XTensor::operator- (const XTensor& tensor)
+{
+    return Sub(*this, tensor);
+}
+
+/* overloading of the division-sign */
+XTensor XTensor::operator/ (const XTensor& tensor)
+{
+    return Div(*this, tensor);
+}
+
 /* 
 linear transformation b = a * \scale + \shift
 >> scale - the slope
@@ -426,8 +440,12 @@ get the size of a given dimension
 int XTensor::GetDim(const int dim)
 {
    CheckNTErrors(dim < order, "dimenision is out of range!");
+    
+    int d = dim;
+    if(dim < 0)
+        d = order - 1;

-    return dimSize[dim];
+    return dimSize[d];
 }

 /* 
@@ -454,6 +472,27 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize)
    memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
 }

+/* 
+reshape the tensor to a vector 
+>> num - number of elements
+*/
+void XTensor::Reshape(const int num)
+{
+    int dim = num;
+    Reshape(1, &dim);
+}
+
+/* 
+reshape the tensor to a matrix 
+>> rowNum - number of rows
+>> colNum - number of columns
+*/
+void XTensor::Reshape(const int rowNum, const int colNum)
+{
+    int dims[2] = {rowNum, colNum};
+    Reshape(2, dims);
+}
+
 /* get the number of items in the data array */
 int XTensor::GetSize() const
 {
@@ -560,25 +599,24 @@ set the tensor items by a uniform distribution in range [lower, upper]
 void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
 {
    // TODO: cuda code!!!!!!!
-    // TODO: replace float with DTYPE

    if (data == NULL)
        return;

    // srand((unsigned)time(0));
-
+    DTYPE variance = upper - lower;
    void * d = NULL;
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            DTYPE value = lower + (upper - lower) * (float)rand() / RAND_MAX;
+            DTYPE value = lower + variance * (float)rand() / RAND_MAX;
            *((float*)d + i) = value;
        }
    }
    else if (dataType == X_DOUBLE) {
        d = new double[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            *((double*)d + i) = lower + (upper - lower) * rand() / RAND_MAX;
+            *((double*)d + i) = lower + variance * rand() / RAND_MAX;
        }
    }
    else {
@@ -588,15 +626,15 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
    SetData(d, unitNum);
    
    if (dataType == X_FLOAT) {
-        delete[](float*)d;
+        delete[] (float*)d;
    }
    else {
-        delete[](double*)d;
+        delete[] (double*)d;
    }
 }

-/* a gauss distribution */
-double GaussRand()
+/* a gauss distribution (Box-Muller method) */
+double GaussRand(DTYPE mean, DTYPE standardDeviation)
 {
    // TODO: cuda code!!!!!!!

@@ -606,8 +644,8 @@ double GaussRand()
    double pi = 3.141592654;

    if (phase == 0){
-        u = rand() / (RAND_MAX + 1.0);
-        v = rand() / (RAND_MAX + 1.0);
+        u = (rand() + 1.0) / (RAND_MAX + 1.0);
+        v = (rand() + 1.0) / (RAND_MAX + 1.0);
        z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
    }
    else{
@@ -615,7 +653,7 @@ double GaussRand()
    }

    phase = 1 - phase;
-    return z;
+    return mean + (z * standardDeviation);
 }

 /* 
@@ -626,7 +664,6 @@ set the tensor items by a normal distribution
 void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
 {
    // TODO: cuda code!!!!!!!
-    // TODO: replace float with DTYPE

    if (data == NULL)
        return;
@@ -636,13 +673,13 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            *((float*)d + i) = (float)GaussRand();
+            *((float*)d + i) = (float)GaussRand(mean, standardDeviation);
        }
    }
    else if (dataType == X_DOUBLE) {
        d = new double[unitNum];
        for (int i = 0; i < unitNum; i++) {
-            *((double*)d + i) = GaussRand();
+            *((double*)d + i) = GaussRand(mean, standardDeviation);
        }
    }
    else {
@@ -652,10 +689,10 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
    SetData(d, unitNum);

    if (dataType == X_FLOAT) {
-        delete[](float*)d;
+        delete[] (float*)d;
    }
    else {
-        delete[](double*)d;
+        delete[] (double*)d;
    }
 }

@@ -1003,13 +1040,13 @@ set the value of a cell in a 3d tensor in default type
 */
 bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
 {
-    CheckNTErrors((order == 3), "Cannot get a 2d cell for a tensor whose order is not 2!");
-    CheckNTErrors((d0 >= 0 && d1 < dimSize[0]), "dimension 0 is out of range!");
-    CheckNTErrors((d2 >= 0 && d2 < dimSize[1]), "dimension 1 is out of range!");
-    CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 1 is out of range!");
-    CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
+    CheckNTErrors(order == 3, "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors(d0 >= 0 && d0 < dimSize[0], "dimension 0 is out of range!");
+    CheckNTErrors(d1 >= 0 && d1 < dimSize[1], "dimension 1 is out of range!");
+    CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 1 is out of range!");
+    CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");

-    int dims[3] = {d0, d1, d1};
+    int dims[3] = {d0, d1, d2};

    return SetToDevice(devID, GetCell(dims, 3), value);
 }
@@ -1439,6 +1476,21 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
 }

 /* 
+dump data to a file
+>> tensor - tensor whose data is dumped
+>> file - where to domp the data
+>> label - label of the tensor
+>> n - number of items to dump
+>> verbose - verbose level
+*/
+void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
+{
+    XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
+    _CopyValues(tensor, &a);
+    a.Dump(file, label, n, verbose);
+}
+
+/* 
 read data from a file
 >> file - where to load the data
 >> label - label of the tensor
@@ -1687,13 +1739,13 @@ void InitTensor(XTensor * tensor,

        dims[0] = -abs(dims[0]);

-        tensor->Resize(myOrder, dims, myDataType, myDenseRatio);
-
-        if(myDevID == CURRENT_GPU)
+        if (myDevID == CURRENT_GPU)
            tensor->devID = XDevice::GetGPUDevice();
        else
            tensor->devID = myDevID;

+        tensor->Resize(myOrder, dims, myDataType, myDenseRatio);
+
        if(allocated)
            XTensor::AllocateData(tensor);
    }
@@ -1870,28 +1922,47 @@ generate a XTensor which allocates data on the buffer
 >> myDimSize - the size of each dimension
 >> myMem - memory pool used to allocating the data array.
           we actually allocate the data on the buffer associated with
-           the memory pool.
+           the memory pool
+>> devID - device id
 >> myDataType - unit size (e.g., int, float, and double)
 >> myDenseRatio - how often an element has non-zero value

 */
-XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XMem * myMem,
-                       const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
+XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
+                       const TENSOR_DATA_TYPE myDataType, const float myDenseRatio,
+                       const int devID, XMem * myMem)
 {
-    CheckNTErrors(myMem != NULL, "No memory pool specified!");
-
    int dims[MAX_TENSOR_DIM_NUM];
    memcpy(dims, myDimSize, sizeof(int) * myOrder);

    dims[0] = -abs(dims[0]);

-    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, -1, myMem);
-    tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
+    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
+
+    if(myMem != NULL)
+        tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
+    else
+        tensor->data = XMemAlloc(devID, tensor->unitNum * tensor->unitSize);

    return tensor;
 }

 /* 
+generate a XTensor which allocates data on the buffer 
+>> reference - reference tensor
+>> devID - device id
+>> myMem - memory pool used to allocating the data array.
+           we actually allocate the data on the buffer associated with
+           the memory pool
+*/
+XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem)
+{
+    return NewTensorBuf(reference->order, reference->dimSize, 
+                        reference->dataType, reference->denseRatio,
+                        devID, myMem);
+}
+
+/* 
 generate a dense vector 
 >> num - number of entries
 >> myDataType - unit size (e.g., int, float, and double) 
@@ -2041,7 +2112,7 @@ XTensor * NewTensor(XTensor * a, bool isFilledData)
 free the data space of a given tensor 
 >> tensor - pointer to the tensor
 */
-void DelTensor(const XTensor * tensor)
+void DelTensor(XTensor * tensor)
 {
    delete tensor;
 }
@@ -2050,10 +2121,13 @@ void DelTensor(const XTensor * tensor)
 free the data space of a given tensor (on the buffer)
 >> tensor - pointer to the tensor
 */
-void DelTensorBuf(const XTensor * tensor)
+void DelTensorBuf(XTensor * tensor)
 {
-    CheckNTErrors(tensor->mem != NULL, "No memory pool found!");
-    tensor->mem->ReleaseBuf(tensor->devID, tensor->unitNum * tensor->unitSize);
+    if(tensor->mem != NULL)
+        tensor->mem->ReleaseBuf(tensor->devID, tensor->unitNum * tensor->unitSize);
+    else
+        XMemFree(tensor->devID, tensor->data);
+    tensor->data = NULL;
    delete tensor;
 }


--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -45,12 +45,13 @@ namespace nts{
 struct XLink;

 /* define the maximum number of dimensions in a tensor */
-#define MAX_TENSOR_DIM_NUM 6
+#define MAX_TENSOR_DIM_NUM 8
 #define USE_BATCHED_STRIDED_MAT_MUL
-#define MIN_TENSOR_SPLIT_NUM 10
+#define MIN_TENSOR_SPLIT_NUM 0
 #define MIN_TENSOR_SPLIT_LIST_NUM 1024
 #define MIN_TENSOR_CAT_NUM 8

+
 /* computation flags */
 #define UNSAFE_BUT_FAST_MEM
 #define FAST_MATRIX
@@ -202,6 +203,12 @@ public:
    /* overloading of the multiply-sign */
    XTensor  operator* (const XTensor &tensor);

+    /* overloading of the minus-sign */
+    XTensor  operator- (const XTensor &tensor);
+
+    /* overloading of the division-sign */
+    XTensor  operator/ (const XTensor &tensor);
+
    /* linear transformation */
    XTensor Lin(DTYPE scale, DTYPE shift = 0);

@@ -222,6 +229,12 @@ public:
    /* reshape the tensor */
    void Reshape(const int order, const int * myDimSize);

+    /* reshape the tensor to a vector */
+    void Reshape(const int num);
+
+    /* reshape the tensor to a matrix */
+    void Reshape(const int rowNum, const int colNum);
+
    /* get the number of items in the data array */
    int GetSize() const;

@@ -328,6 +341,10 @@ public:
    /* dump data to a file */
    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);

+    /* dump data to a file */
+    static
+    void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
+
    /* read data from a file */
    void Read(FILE * file, const char * label = NULL);

@@ -386,8 +403,12 @@ XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_
                    const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);

 /* generate a XTensor which allocates data on the buffer */
-XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XMem * myMem,
-                       const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F);
+XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
+                       const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
+                       const int myDevID = -1, XMem * myMem = NULL);
+
+/* generate a XTensor which allocates data on the buffer */
+XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);

 /* generate a dense vector */
 XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, 
@@ -417,10 +438,10 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
 XTensor * NewTensor(XTensor * a, bool isFilledData = true);

 /* free the data space of a given tensor */
-void DelTensor(const XTensor * tensor);
+void DelTensor(XTensor * tensor);

 /* free the data space of a given tensor (on the buffer) */
-void DelTensorBuf(const XTensor * tensor);
+void DelTensorBuf(XTensor * tensor);

 } /* end of the nts (NiuTrans.Tensor) namespace */


--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -175,29 +175,38 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
        return;
    }
 #ifdef USE_CUDA
-    else if(devIDT >= 0 && devIDS < 0){
-        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
-        }
-    }
-    else if(devIDT < 0 && devIDS >= 0){
-        cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
-        if(error != cudaSuccess){
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
-        }
-    }
    else{
-        //if(devIDT == devIDS){
-            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToDevice);
+        int devID = devIDT < 0 ? devIDS : devIDT;
+        int devIDBackup = 0;
+        cudaGetDevice(&devIDBackup);
+        cudaSetDevice(devID);
+
+        if(devIDT >= 0 && devIDS < 0){
+            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
            if(error != cudaSuccess){
-                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
            }
-        /*}
+        }
+        else if(devIDT < 0 && devIDS >= 0){
+            cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+            }
+        }
        else{
-            CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
-                                "cudaMemcpy error (cudaMemcpyDeviceToDevice)");
-        }*/
+            //if(devIDT == devIDS){
+                cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToDevice);
+                if(error != cudaSuccess){
+                    ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+                }
+            /*}
+            else{
+                CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
+                                    "cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+            }*/
+        }
+
+        cudaSetDevice(devIDBackup);
    }
 #else
    ShowNTErrors("Please specify USE_CUDA and recompile the code!");
@@ -208,6 +217,9 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
 #ifdef USE_CUDA
 void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size, cudaStream_t stream, int streamDevID)
 {
+    if(t == s)
+        return;
+
    int devIDBackup = -1;
    if(streamDevID >= 0 && (devIDT >= 0 || devIDS >= 0)){
        CheckNTErrors((cudaGetDevice(&devIDBackup) == cudaSuccess), "Cannot get GPU device id!");
@@ -220,17 +232,23 @@ void XMemCopyAsync(void * t, int devIDT, const void * s, int devIDS, size_t size
        return;
    }
    else if(devIDT >= 0 && devIDS < 0){
-        CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyHostToDevice, stream) == cudaSuccess),
-                            "cudaMemcpyAsync error (cudaMemcpyHostToDevice)");
+        cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyHostToDevice, stream);
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyHostToDevice)");
+        }
    }
    else if(devIDT < 0 && devIDS >= 0){
-        CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToHost, stream) == cudaSuccess),
-                            "cudaMemcpyAsync error (cudaMemcpyDeviceToHost)");
+        cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToHost, stream);
+        if(error != cudaSuccess){
+            ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyDeviceToHost)");
+        }
    }
    else{
        //if(devIDT == devIDS){
-            CheckNTErrors((cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToDevice, stream) == cudaSuccess),
-                                "cudaMemcpyAsync error (cudaMemcpyDeviceToDevice)");
+            cudaError_t error = cudaMemcpyAsync(t, s, size, cudaMemcpyDeviceToDevice, stream);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpyAsync error (cudaMemcpyDeviceToDevice)");
+            }
        //}
        /*else{
            CheckNTErrors((cudaMemcpyPeerAsync(t, devIDT, s, devIDS, size, stream) == cudaSuccess),
@@ -261,18 +279,69 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
        return;
    }
 #ifdef USE_CUDA
-    else if (devIDT >= 0 && devIDS < 0) {
-        CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice) == cudaSuccess),
-                            "cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+    else{
+        int devID = devIDT < 0 ? devIDS : devIDT;
+        int devIDBackup = 0;
+        cudaGetDevice(&devIDBackup);
+        cudaSetDevice(devID);
+
+        if (devIDT >= 0 && devIDS < 0) {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+            }
+        }
+        else if (devIDT < 0 && devIDS >= 0) {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+            }
+        }
+        else {
+            cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
+            if (error != cudaSuccess) {
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+            }
+        }
+
+        cudaSetDevice(devIDBackup);
    }
-    else if (devIDT < 0 && devIDS >= 0) {
-        CheckNTErrors((cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost) == cudaSuccess),
-            "cudaMemcpy error (cudaMemcpyDeviceToHost)");
+#else
+    ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+}
+
+void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
+{
+    if (t == s)
+        return;
+
+    if (devIDT < 0 && devIDS < 0) {
+        for(int i = 0; i < n; i++)
+            memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
+        return;
    }
-    else {
-        cudaError_t error = cudaMemcpy2D(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice);
-        if (error != cudaSuccess) {
-            ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+#ifdef USE_CUDA
+    else{
+        CheckNTErrors(stream != NULL, "No stream found!");
+        cudaStream_t &cstream = stream->stream;
+        if (devIDT >= 0 && devIDS < 0) {
+            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
+            }
+        }
+        else if (devIDT < 0 && devIDS >= 0) {
+            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
+            if(error != cudaSuccess){
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
+            }
+        }
+        else {
+            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
+            if (error != cudaSuccess) {
+                ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
+            }
        }
    }
 #else

--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
@@ -23,6 +23,7 @@

 #include <stdio.h>
 #include "XGlobal.h"
+#include "XDevice.h"

 #ifndef __XUTILITY_H__
 #define __XUTILITY_H__
@@ -41,6 +42,7 @@ extern void XMemSet(void * p, int value, size_t size);
 extern void XMemSet(int devID, void * p, int value, size_t size);
 extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
 extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
+extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
 extern void * XMemAlloc(int devID, size_t size);
 extern void * XMemAllocOnDev(int devID, size_t size);
 extern void XMemFree(int devID, void * p);

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -26,49 +26,63 @@

 #include "../XTensor.h"

-#include "shape/Concatenate.h"
-#include "shape/ConcatenateSolely.h"
-#include "movement/CopyBlocks.h"
-#include "movement/CopyBlocksInGrid.h"
-#include "movement/CopyBlocksOnSite.h"
-#include "movement/CopyData2D.h"
-#include "movement/CopyIndexed.h"
-#include "movement/CopyInGrid.h"
-#include "movement/CopyValues.h"
-#include "utilities/FlushToMem.h"
-#include "shape/MakeMergeBlockIndex.h"
-#include "shape/MakeSplitBlockIndex.h"
+#include "arithmetic/Div.h"
 #include "arithmetic/MatrixMul.h"
 #include "arithmetic/MatrixMul2D.h"
 #include "arithmetic/MatrixMul2DMultiTheading.h"
 #include "arithmetic/MatrixMul2DParallel.h"
 #include "arithmetic/MatrixMulBatched.h"
-#include "arithmetic/MatrixMULBatchedCPU.h"
-#include "shape/Merge.h"
-#include "shape/MergeBlockLists.h"
 #include "arithmetic/Multiply.h"
 #include "arithmetic/Negate.h"
+#include "arithmetic/Sign.h"
+#include "arithmetic/Sub.h"
+#include "arithmetic/Sum.h"
+#include "arithmetic/SumByColumnTV.h"
+#include "arithmetic/SumByColumnVT.h"
+#include "arithmetic/SumDim.h"
+#include "arithmetic/XTensorBLAS.h"
+
+#include "getandset/ConvertDataType.h"
+#include "getandset/Select.h"
+#include "getandset/SetData.h"
+
+#include "math/Clip.h"
 #include "math/Normalize.h"
-#include "shape/Permute.h"
 #include "math/Power.h"
+#include "math/ScaleAndShift.h"
+#include "math/Unary.h"
+
+
+#include "movement/CopyBlocks.h"
+#include "movement/CopyBlocksInGrid.h"
+#include "movement/CopyBlocksOnSite.h"
+#include "movement/CopyData2D.h"
+#include "movement/CopyIndexed.h"
+#include "movement/CopyInGrid.h"
+#include "movement/CopyValues.h"
+
 #include "reduce/ReduceMax.h"
 #include "reduce/ReduceMean.h"
 #include "reduce/ReduceStandardVariance.h"
 #include "reduce/ReduceSum.h"
 #include "reduce/ReduceSumSquared.h"
 #include "reduce/ReduceVariance.h"
-#include "math/ScaleAndShift.h"
-#include "getandset/Select.h"
-#include "getandset/SetData.h"
-#include "sort/Sort.h"
+
+#include "shape/Concatenate.h"
+#include "shape/ConcatenateSolely.h"
+#include "shape/MakeMergeBlockIndex.h"
+#include "shape/MakeSplitBlockIndex.h"
+#include "shape/Merge.h"
+#include "shape/MergeBlockLists.h"
+#include "shape/Permute.h"
 #include "shape/Split.h"
-#include "arithmetic/Sum.h"
-#include "arithmetic/SumByColumnTV.h"
-#include "arithmetic/SumByColumnVT.h"
-#include "sort/TopK.h"
 #include "shape/Transpose.h"
 #include "shape/Unsqueeze.h"
+
+#include "sort/Sort.h"
+#include "sort/TopK.h"
+
 #include "utilities/XMatrixSegment.h"
-#include "arithmetic/XTensorBLAS.h"
+#include "utilities/FlushToMem.h"

 #endif // __CHEADER_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "Div.h"
+#include "Div.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+element-wise division of two tensors
+
+c(i) = a(i)/b(i) + \alpha * c(i)
+where i is the index of the item
+
+>> a - tensor a
+>> b - tensor b
+>> c - result tensor
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
+{
+	int leadingDimRDI = a->order - leadingDim - 1;
+    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
+                  "Unmatched tensors in multiplication!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), 
+                  "Unmatched tensors!");
+
+#ifdef USE_CUDA
+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+        _CudaDiv(a, b, c, alpha, leadingDim);
+        return;
+    }
+#endif
+
+    int stride = 1;
+    int blockSizeA = 1;
+    int blockSizeB = 1;
+    int blockSizeC = 1;
+    int blockNum = 1;
+    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+
+    for (int i = 0; i < a->order; i++) {
+        if (i != leadingDimRDI) {
+            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+                          "Unmatched tensors!");
+        }
+        if (i < leadingDimRDI)
+            stride *= a->dimSizeRDI[i];
+    }
+
+    blockSizeA = stride * dimensionSizeA;
+    blockSizeB = stride * dimensionSizeB;
+    blockSizeC = stride * dimensionSizeC;
+    blockNum = a->unitNum / blockSizeA;
+
+    if (!a->isSparse && !b->isSparse) {
+        if (a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE) {
+            if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
+                int size = a->unitNum;
+                DTYPE * ap = (DTYPE*)a->data;
+                DTYPE * bp = (DTYPE*)b->data;
+                DTYPE * cp = (DTYPE*)c->data;
+                if (alpha == 0) {
+                    for (int i = 0; i < size; i++)
+                        cp[i] = ap[i] / bp[i];
+                }
+                else {
+                    for (int i = 0; i < size; i++)
+                        cp[i] = ap[i] / bp[i] + alpha * cp[i];
+                }
+            }
+            else {
+                for (int k = 0; k < blockNum; k++) {
+
+                    for (int ci = 0, ai = 0, bi = 0; ci < dimensionSizeC; ci++, ai++, bi++) {
+                        if (ai >= dimensionSizeA)
+                            ai = 0;
+                        if (bi >= dimensionSizeB)
+                            bi = 0;
+                        DTYPE * ap = (DTYPE*)a->data + k * blockSizeA + ai * stride;
+                        DTYPE * bp = (DTYPE*)b->data + k * blockSizeB + bi * stride;
+                        DTYPE * cp = (DTYPE*)c->data + k * blockSizeC + ci * stride;
+                        for (int j = 0; j < stride; j++)
+                            cp[j] = ap[j] / bp[j] + cp[j] * alpha;
+                    }
+                }
+            }
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+    else {
+        // TODO!!
+        ShowNTErrors("TODO!");
+    }
+}
+
+/*
+element-wise division of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where keep the result)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
+{
+    _Div(a, b, a, alpha, leadingDim);
+}
+
+/*
+element-wise division of two tensors (return a XTensor structure)
+make a new tensor c to keep the result and return it
+
+c(i) = a(i)*b(i)
+where i is the index of the item
+
+>> a - tensor a
+>> b - tensor b
+>> leadingDim - the dimension along which we perform broadcasting
+<< return - the product of the tensors
+*/
+XTensor Div(const XTensor &a, const XTensor &b, int leadingDim)
+{
+    CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
+
+    XTensor c(&a);
+    c.SetTMP();
+    
+    /* call _Multiply function */
+    _Div(&a, &b, &c, 0, leadingDim);
+    
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_DIV);
+    XLink::AddParamToHeadInt(&c, leadingDim);
+    
+    return c;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+*/
+
+#include "../../XDevice.h"
+#include "../../XTensor.h"
+#include "Div.h"
+#include "Div.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+/*
+division of data arrays in a element-wise manner c(i) = a(i)/b(i)
+>> a - data array a
+>> b - data array b
+>> c - result data array
+>> size - size of c
+*/
+__global__
+void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size)
+        c[i] = a[i] / b[i];
+}
+
+/*
+division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
+>> a - data array a
+>> b - data array b
+>> c - result data array
+>> size - size of c
+>> alpha - the coefficient
+*/
+__global__
+void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size)
+        c[i] = a[i] / b[i] + alpha * c[i];
+}
+
+/*
+division of two tensors in a element-wise manner c(i) = a(i)/b(i).
+Note that a and b can be of different sizes here, i.e.,
+|a_lead| <= |c_lead| and |b_lead| <= |c_lead|
+where |a_lead| means the size of the leading dimension of a
+>> a - tensor a
+>> b - tensor b
+>> c - result tensor
+>> alpha - the coefficient
+>> stride - the number of items we go over when move next along the leading dimension in a block
+>> ldSizeA - size of the leading dimension of a
+>> ldSizeB - size of the leading dimension of b
+>> ldSizeC - size of the leading dimension of c
+>> blockNum - number of blocks
+*/
+template<int nonZeroAlpha> __global__
+void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
+    int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
+{
+    __shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ DTYPE* cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (i >= blockNum * stride || j >= ldSizeC)
+        return;
+
+    if (threadIdx.y == 0) {
+        int block = i / stride;
+        int size = block * stride;
+        ap[threadIdx.x] = a + size * ldSizeA;
+        bp[threadIdx.x] = b + size * ldSizeB;
+        cp[threadIdx.x] = c + size * ldSizeC;
+    }
+
+    __syncthreads();
+
+    int aj = j >= ldSizeA ? j % ldSizeA : j;
+    int bj = j >= ldSizeB ? j % ldSizeB : j;
+    int offseti = i % stride;
+
+    if (nonZeroAlpha == 0)
+        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj * ldSizeA + offseti] / bp[threadIdx.x][bj * ldSizeB + offseti];
+    else
+        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj * ldSizeA + offseti] / bp[threadIdx.x][bj * ldSizeB + offseti]
+                                                 + alpha * cp[threadIdx.x][j * ldSizeC + offseti];
+}
+
+/*
+element-wise division of two tensors
+c(i) = a(i)*b(i) + \alpha * c(i)
+where i is the item index
+>> a - tensor a
+>> b - tensor b
+>> c - result tensor
+>> alpha - the coefficient
+>> leadingDim - dimension along which we perform broadcasting
+*/
+void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
+{
+	int leadingDimRDI = a->order - leadingDim - 1;
+    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
+                  "Unmatched tensors in multiplication!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
+
+    int stride = 1;
+    int blockSizeA = 1;
+    int blockNum = 1;
+    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+
+    for (int i = 0; i < a->order; i++) {
+        if (i != leadingDimRDI) {
+            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
+                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+                          "Unmatched tensors!");
+        }
+        if (i < leadingDimRDI)
+            stride *= a->dimSizeRDI[i];
+    }
+
+    blockSizeA = stride * dimensionSizeA;
+    blockNum = a->unitNum / blockSizeA;
+
+    int devIDBackup;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    if (!a->isSparse && !b->isSparse) {
+        if (a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE) {
+            int cudaGridSize[3];
+            int cudaBlockSize[3];
+
+            if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
+                GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
+                dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
+
+                if (alpha == 0)
+                    KernelDivElementWise << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum);
+                else
+                    KernelDivElementWiseV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, c->unitNum, alpha);
+            }
+            else {
+                GDevs.GetCudaThread2D(c->devID, stride * blockNum, dimensionSizeC, MAX_INT, cudaGridSize, cudaBlockSize);
+                dim3 blocks(cudaGridSize[0], cudaGridSize[1]), threads(cudaBlockSize[0], cudaBlockSize[1]);
+
+                if (alpha == 0) {
+                    KernelDivElementWiseTensorDynamic<0> << <blocks, threads >> >
+                        ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
+                        stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
+                }
+                else {
+                    KernelDivElementWiseTensorDynamic<1> << <blocks, threads >> >
+                        ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
+                        stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
+                }
+            }
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+    else {
+        // TODO!!
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Div.cuh
+++ b/source/tensor/core/arithmetic/Div.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#ifndef __DIV_CUH__
+#define __DIV_CUH__
+
+#include "Div.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) */
+__global__
+void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
+
+/* division of two tensors in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) */
+__global__
+void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
+
+/* division of two tensors in a element-wise manner c(i) = a(i)/b(i)+ \alpha*c(i)  */
+template<int nonZeroAlpha>__global__
+void KernelDivElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
+
+/* element-wise division of two tensors */
+void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __DIV_CUH__
+
--- a/source/tensor/core/math/Log.h
+++ b/source/tensor/core/math/Log.h
@@ -16,31 +16,39 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */

-#ifndef __LOG_H__
-#define __LOG_H__
+#ifndef __DIV_H__
+#define __DIV_H__

 #include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* set every entry to its log value */
-void _Log(const XTensor * a, XTensor * b);
+/* 
+element-wise division of two tensors:
+c(i) = a(i)/b(i) + \alpha * c(i) 
+where i is the index of the element
+*/
+void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);

 /* 
-set every entry to its log value (do it on site)
+element-wise division of two tensors (do it on site)
 keep the result in the input tensor a and return nothing
+a(i) = a(i)/b(i) + \alpha * a(i) 
+where i is the index of the element 
 */
-void _LogMe(XTensor * a);
+void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);

 /* 
-set every entry to its log value (return a XTensor structure)
+element-wise division of two tensors (return a XTensor structure)
 make a new tensor to keep the result and return it
+c(i) = a(i)/b(i)
+where i is the index of the element 
 */
-XTensor Log(const XTensor & a);
+XTensor Div(const XTensor &a, const XTensor &b, int leadingDim = 0);

 } // namespace nts(NiuTrans.Tensor)

-#endif // __LOG_H__
+#endif // __DIV_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
+++ b/source/tensor/core/arithmetic/MatrixMULBatchedCPU.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XTensor.h"
-#include "MatrixMULBatchedCPU.h"
-#include "MatrixMul2D.h"
-#include "XTensorBLAS.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-matrix multiplication in batch mode (BLAS)
-c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
->> a - list of input matrices (2d tensors)
->> transposedA - indicate whether the matrix a is transposed
->> b - another list of input matrices (2d tensors)
->> transposedB - indicate whether the matrix b is transposed
->> c - output matrix (2d tensor)
->> alpha - scalar
->> beta - scalar
-*/
-void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
-                          const XList * b, MATRIX_TRANS_TYPE transposedB,
-                          XList * c, DTYPE alpha, DTYPE beta)
-{
-    CheckNTErrors(a && b && c, "Empty input lists!");
-    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
-
-    if (a->count == 0)
-        return;
-
-    bool isUniform = true;
-    for (int i = 1; i < a->count; i++) {
-        XTensor * aim = (XTensor*)a->GetItem(i - 1);
-        XTensor * bim = (XTensor*)b->GetItem(i - 1);
-        XTensor * cim = (XTensor*)c->GetItem(i - 1);
-        XTensor * ai = (XTensor*)a->GetItem(i);
-        XTensor * bi = (XTensor*)b->GetItem(i);
-        XTensor * ci = (XTensor*)c->GetItem(i);
-        if (!XTensor::IsSameShaped(aim, ai) ||
-            !XTensor::IsSameShaped(bim, bi) ||
-            !XTensor::IsSameShaped(cim, ci))
-        {
-            isUniform = false;
-            break;
-        }
-    }
-
-    for (int i = 0; i < a->count; i++) {
-        XTensor * ai = (XTensor*)a->GetItem(i);
-        XTensor * bi = (XTensor*)b->GetItem(i);
-        XTensor * ci = (XTensor*)c->GetItem(i);
-        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
-        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
-        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
-#ifdef USE_BLAS
-        if (useBLAS)
-            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
-        else
-            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
-#else
-        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
-#endif
-    }
-    //}
-}
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
@@ -24,8 +24,8 @@
 #include "../../XName.h"
 #include "MatrixMul.h"
 #include "MatrixMul2D.h"
-#include "MatrixMULBatchedCPU.h"
 #include "XTensorBLAS.h"
+#include "MatrixMulBatched.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -53,11 +53,29 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
-    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
+    CheckNTErrors(a && b && c, "Empty input tensors!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Input tensors should have the same data type!");
-    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
+    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
                  "Input tensors must have a order >= 2!");
+    CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
+    
+    /* we transform a higher order tensor to a matrix to kill the number
+       of calls of matrix multiplication */
+    if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
+        int ncolA = a->dimSize[a->order - 1];
+        int ncolC = c->dimSize[c->order - 1];
+        XTensor * a2 = NewTensor2D(a->unitNum/ncolA, -ncolA, a->dataType, a->devID, a->mem);
+        XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
+        a2->data = a->data;
+        c2->data = c->data;
+        _MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
+        a2->data = NULL;
+        c2->data = NULL;
+        delete a2;
+        delete c2;
+        return;
+    }

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
@@ -144,10 +162,10 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,

        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
        _CudaBLASMatrixMULList(handle,
-                              aList, transposedA,
-                              bList, transposedB,
-                              cList, aList->count,
-                              alpha, beta);
+                               aList, transposedA,
+                               bList, transposedB,
+                               cList, aList->count,
+                               alpha, beta);

        BacktoCudaDev(a->devID, devIDBackup);
 #else
@@ -156,9 +174,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    }
    else {
        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-        _MatrixMULBatchedCPU(aList, transposedA,
-            bList, transposedB,
-            cList, alpha, beta);
+        _MatrixMulBatchedCPU(aList, transposedA,
+                             bList, transposedB,
+                             cList, alpha, beta);
    }

    for (int i = 0; i < aList->count; i++) {
@@ -251,9 +269,7 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
 /* 
 matrix multiplication with no transposition c = a * b * alpha
 >> a - tensor a
->> transposedA - indicates whether the matrices in a are transposed
 >> b - tensor b
->> transposedB - indicates whether teh matrices in b are transposed
 >> alpha - a coefficient
 >> parallelRunner - parallel processing module
 << return - the result of matrix multiplication

--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -23,8 +23,8 @@
 #include "../../XDevice.h"
 #include "../../XName.h"
 #include "MatrixMulBatched.h"
-#include "MatrixMULBatchedCPU.h"
 #include "XTensorBLAS.h"
+#include "MatrixMul2D.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -57,6 +57,43 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    CheckNTErrors((a->order == b->order && a->order == c->order), 
                  "Input tensor and output tensor must have same order!");

+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0)
+        _MatrixMulBatchedGPU(a, transposedA, b, transposedB, c, alpha, beta);
+    else
+        _MatrixMulBatchedCPU(a, transposedA, b, transposedB, c, alpha, beta);
+}
+
+/*
+matrix multiplication of the two tensors
+optimized for GPU
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = trans(ai) * trans(bi) * alpha + cm * beta
+where trans() returns the transposed matrix if the flag is fired
+
+>> a - tensor a
+>> transposedA - indicates whether the matrices in a are transposed
+>> b - tensor b
+>> transposedB - indicates whether teh matrices in b are transposed
+>> c - where we keep a*b
+>> alpha - a coefficient
+>> beta - another coefficient
+*/
+void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                          XTensor * c, DTYPE alpha, DTYPE beta)
+{
+#ifdef USE_CUDA
+    CheckNTErrors((a && b && c), "Empty input tensors!");
+    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
+                  "Input tensors should have the same data type!");
+    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
+                  "Input tensors must have a order >= 2!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), 
+                  "Input tensor and output tensor must have same order!");
+    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
+
    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
@@ -64,8 +101,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

-    CheckNTErrors((am == bn && an == cn && bm == cm),
-        "Unmatched tensors in multiplication!");
+    CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
@@ -81,76 +117,159 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
        blockNum *= a->dimSizeRDI[i];
    }

-    XList * aList = new XList(10);
-    XList * bList = new XList(10);
-    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
-
-    for (int p = 0; p < blockNum; p++) {
-        void * ap = (char*)a->data + aRealBlockSize * p;
-        void * bp = (char*)b->data + bRealBlockSize * p;
-        void * cp = (char*)c->data + cRealBlockSize * p;
-        XTensor * ai = NewTensor(2, aDimSize, a->dataType, a->denseRatio, a->devID, a->mem);
-        XTensor * bi = NewTensor(2, bDimSize, b->dataType, b->denseRatio, b->devID, b->mem);
-        XTensor * ci = NewTensor(2, cDimSize, c->dataType, c->denseRatio, c->devID, c->mem);
-        ai->data = ap;
-        bi->data = bp;
-        ci->data = cp;
-        aList->Add(ai);
-        bList->Add(bi);
-        cList->Add(ci);
+    int devIDBackup = 0;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+    _CudaBLASMatrixMULBatchedStrided(handle,
+                                     a->data, transposedA, a->dataType, aBlockSize,
+                                     b->data, transposedB, b->dataType, bBlockSize,
+                                     c->data, c->dataType, cBlockSize, blockNum,
+                                     a->dimSizeRDI[1], a->dimSizeRDI[0],
+                                     b->dimSizeRDI[1], b->dimSizeRDI[0],
+                                     c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+
+    BacktoCudaDev(a->devID, devIDBackup);
+#endif
+}
+
+/*
+matrix multiplication of the two tensors
+optimized for CPU
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = trans(ai) * trans(bi) * alpha + cm * beta
+where trans() returns the transposed matrix if the flag is fired
+
+>> a - tensor a
+>> transposedA - indicates whether the matrices in a are transposed
+>> b - tensor b
+>> transposedB - indicates whether teh matrices in b are transposed
+>> c - where we keep a*b
+>> alpha - a coefficient
+>> beta - another coefficient
+*/
+void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                          XTensor * c, DTYPE alpha, DTYPE beta)
+{
+CheckNTErrors((a && b && c), "Empty input tensors!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                 "Input tensors should have the same data type!");
+    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
+                 "Input tensors must have a order >= 2!");
+    CheckNTErrors(a->order == b->order && a->order == c->order, 
+                 "Input tensor and output tensor must have same order!");
+
+
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int cn = c->dimSizeRDI[1];
+    int cm = c->dimSizeRDI[0];
+
+    CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
+
+    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
+    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
+    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aRealBlockSize = aBlockSize * a->unitSize;
+    int bRealBlockSize = bBlockSize * b->unitSize;
+    int cRealBlockSize = cBlockSize * c->unitSize;
+    int blockNum = 1;
+
+    for (int i = 2; i < a->order; i++) {
+        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSizeRDI[i];
    }

-    if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
-#ifdef USE_CUDA
-        CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
-                      "The code must be run on the same GPU!");
-        
-        int devIDBackup;
-        ProtectCudaDev(a->devID, devIDBackup);
-
-        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
-        _CudaBLASMatrixMULList(handle,
-							   aList, transposedA,
-                               bList, transposedB,
-                               cList, aList->count,
-                               alpha, beta);
-
-        BacktoCudaDev(a->devID, devIDBackup);
+    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
+    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
+    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
+
+    XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
+    XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
+    XTensor * ci = NewTensor2D(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
+
+    for (int i = 0; i < blockNum; i++) {
+        ai->data = (char*)a->data + i * aRealBlockSize;
+        bi->data = (char*)b->data + i * bRealBlockSize;
+        ci->data = (char*)c->data + i * cRealBlockSize;
+#ifdef USE_BLAS
+        if (useBLAS)
+            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
+        else
+            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #else
-        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
 #endif
    }
-    else {
-        CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-        _MatrixMULBatchedCPU(aList, transposedA,
-            bList, transposedB,
-            cList, alpha, beta);
-    }

-    for (int i = 0; i < aList->count; i++) {
-        XTensor * ai = (XTensor*)aList->GetItem(i);
-        ai->data = NULL;
-        delete ai;
-    }
+    ai->data = NULL;
+    bi->data = NULL;
+    ci->data = NULL;
+    delete ai;
+    delete bi;
+    delete ci;
+}

-    for (int i = 0; i < bList->count; i++) {
-        XTensor * bi = (XTensor*)bList->GetItem(i);
-        bi->data = NULL;
-        delete bi;
-    }
+/*
+matrix multiplication in batch mode for list inputs (BLAS)
+c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
+>> a - list of input matrices (2d tensors)
+>> transposedA - indicate whether the matrix a is transposed
+>> b - another list of input matrices (2d tensors)
+>> transposedB - indicate whether the matrix b is transposed
+>> c - output matrix (2d tensor)
+>> alpha - scalar
+>> beta - scalar
+*/
+void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
+                          const XList * b, MATRIX_TRANS_TYPE transposedB,
+                          XList * c, DTYPE alpha, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty input lists!");
+    CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
+
+    if (a->count == 0)
+        return;

-    for (int i = 0; i < cList->count; i++) {
-        XTensor * ci = (XTensor*)cList->GetItem(i);
-        ci->data = NULL;
-        delete ci;
+    bool isUniform = true;
+    for (int i = 1; i < a->count; i++) {
+        XTensor * aim = (XTensor*)a->GetItem(i - 1);
+        XTensor * bim = (XTensor*)b->GetItem(i - 1);
+        XTensor * cim = (XTensor*)c->GetItem(i - 1);
+        XTensor * ai = (XTensor*)a->GetItem(i);
+        XTensor * bi = (XTensor*)b->GetItem(i);
+        XTensor * ci = (XTensor*)c->GetItem(i);
+        if (!XTensor::IsSameShaped(aim, ai) ||
+            !XTensor::IsSameShaped(bim, bi) ||
+            !XTensor::IsSameShaped(cim, ci))
+        {
+            isUniform = false;
+            break;
+        }
    }

-    delete aList;
-    delete bList;
-    delete cList;
+    for (int i = 0; i < a->count; i++) {
+        XTensor * ai = (XTensor*)a->GetItem(i);
+        XTensor * bi = (XTensor*)b->GetItem(i);
+        XTensor * ci = (XTensor*)c->GetItem(i);
+        CheckNTErrors((ai->order == 2), "2d tensor (i.e., matrix) is required!");
+        CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
+        CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
+#ifdef USE_BLAS
+        if (useBLAS)
+            _MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
+        else
+            _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+#else
+        _MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+#endif
+    }
 }

 /*
@@ -212,4 +331,60 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const 
    return c;
 }

+/*
+matrix multiplication of the two tensors (do it on site)
+c = a * b * alpha
+make a new tensor to keep the result and return it
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = ai * bi * alpha + cm * beta
+
+>> a - tensor a
+>> b - tensor b
+>> alpha - a coefficient
+>> parallelRunner - parallel processing module
+<< return - the result of matrix multiplication of the two tensors
+*/
+XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
+                         DTYPE alpha, XPRunner * parallelRunner)
+{
+    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
+    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
+    CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
+
+    int an = a.dimSizeRDI[1];
+    int am = a.dimSizeRDI[0];
+    int bn = b.dimSizeRDI[1];
+    int bm = b.dimSizeRDI[0];
+
+    CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
+
+    int order = a.order;
+    int sub = 0;
+    int * dimSize = new int[order];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    dimSize[sub++] = an;
+    dimSize[sub++] = bm;
+
+    float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
+    XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    c.SetTMP();
+
+    /*call _MatrixMulBatched function */
+    _MatrixMulBatched(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
+
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHead(&c, alpha);
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    return c;
+}
+
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/MatrixMulBatched.h
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.h
@@ -26,6 +26,8 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

+#define BMMul MatrixMulBatched
+
 /*
 matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta

@@ -37,6 +39,28 @@ where trans() returns the transposed matrix if the flag is fired
 void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                       XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

+
+/*
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
+optimized for GPU
+*/
+void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+
+/*
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
+optimized for GPU
+*/
+void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, 
+                          XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+
+/*
+matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
+optimized for GPU
+*/
+void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, 
+                          XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+
 /*
 matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
 make a new tensor to keep the result and return it
@@ -49,6 +73,17 @@ where trans() returns the transposed matrix if the flag is fired
 XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

+/*
+matrix multiplication of the two tensors (return a XTensor structure) c = a * b * alpha
+make a new tensor to keep the result and return it
+
+for each 2-dimensional data array in a (denoted as ai) and
+each 2-dimensional data array in b (denoted as bi), we have
+ci = ai * bi * alpha + cm * beta
+*/
+XTensor MatrixMulBatched(const XTensor &a, const XTensor &b, 
+                         DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __MATRIXMULBATCHED_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -32,9 +32,9 @@ element-wise product of two tensors
 c(i) = a(i)*b(i) + \alpha * c(i)
 where i is the index of the item

->> a - matrix a
->> b - matrix b
->> c - result matrix
+>> a - tensor a
+>> b - tensor b
+>> c - result tensor
 >> alpha - the coefficient
 >> leadingDim - the dimension along which we perform broadcasting
 */

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
@@ -104,9 +104,9 @@ void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE al
    int offseti = i % stride;

    if (nonZeroAlpha == 0)
-        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj* ldSizeA + offseti] * bp[threadIdx.x][bj* ldSizeB + offseti];
+        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj * ldSizeA + offseti] * bp[threadIdx.x][bj * ldSizeB + offseti];
    else
-        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj* ldSizeA + offseti] * bp[threadIdx.x][bj* ldSizeB + offseti] +
+        cp[threadIdx.x][j * ldSizeC + offseti] = ap[threadIdx.x][aj * ldSizeA + offseti] * bp[threadIdx.x][bj * ldSizeB + offseti] +
        alpha * cp[threadIdx.x][j * ldSizeC + offseti];
 }


--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
@@ -76,7 +76,7 @@ XTensor Sign(const XTensor & a)
    XTensor b(&a);
    b.SetTMP();

-    /* call _ScaleAndShift function */
+    /* call _Sign function */
    _Sign(&a, &b);

    /* tensor connections */

--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#include "../../XTensor.h"
+#include "../../XName.h"
+#include "../../XUtility.h"
+#include "Sub.h"
+#include "Sub.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+tensor subtraction c = a - b * \beta
+
+>> a - a tensor
+>> b - another tensor
+>> c - where we put a-b*\beta. we save it in a if c is NULL
+>> beta - the scaling factor
+*/
+void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
+                  "Unmatched tensors in addition!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched tensors in addition!");
+
+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+
+#ifdef USE_CUDA
+        if (a == c) {
+            int P2PAccesible = 0;
+#ifdef CUDA_UVA
+            cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
+#endif
+            if ((a->devID < 0 && b->devID >= 0) ||
+                (a->devID >= 0 && b->devID < 0) ||
+                (a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
+            {
+                ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
+            }
+            else
+                _CudaSub(a, b, c, beta);
+        }
+        else
+            _CudaSub(a, b, c, beta);
+
+#endif
+    }
+    else {
+        if (!a->isSparse && !b->isSparse) {
+            CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
+    
+            if (a->dataType == DEFAULT_DTYPE &&
+                b->dataType == DEFAULT_DTYPE &&
+                c->dataType == DEFAULT_DTYPE)
+            {
+                DTYPE * ap = (DTYPE*)a->data;
+                DTYPE * bp = (DTYPE*)b->data;
+                DTYPE * cp = (DTYPE*)c->data;
+    
+                /* unrolling */
+                int num = a->unitNum;
+                if (num % 4 == 0) {
+                    for (int i = 0; i < num; i += 4) {
+                        cp[i] = ap[i] - bp[i] * beta;
+                        cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
+                        cp[i + 2] = ap[i + 2] - bp[i + 2] * beta;
+                        cp[i + 3] = ap[i + 3] - bp[i + 3] * beta;
+                    }
+                }
+                else if (num % 2 == 0) {
+                    for (int i = 0; i < num; i += 2) {
+                        cp[i] = ap[i] - bp[i] * beta;
+                        cp[i + 1] = ap[i + 1] - bp[i + 1] * beta;
+                    }
+                }
+                else {
+                    for (int i = 0; i < num; i++) {
+                        cp[i] = ap[i] - bp[i] * beta;
+                    }
+                }
+            }
+            else {
+                // TODO!!
+                ShowNTErrors("TODO!");
+            }
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+}
+    
+/*
+tensor subtraction a = a - b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
+{
+    _Sub(a, b, a, beta);
+}
+    
+/*
+tensor subtraction c = a - b * \beta (return a XTensor structure)
+make a new tensor c to keep the result and return it
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+<< return - the result of tensor subtraction
+*/
+XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
+{
+    XTensor c(&a);
+    c.SetTMP();
+
+    /* call _Sub function */
+    _Sub(&a, &b, &c, beta);
+    
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_SUB);
+    XLink::AddParamToHead(&c, beta);
+    
+    return c;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sub.cu
+++ b/source/tensor/core/arithmetic/Sub.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#include "../../XDevice.h"
+#include "../../XUtility.h"
+#include "Sub.cuh"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/*
+subtraction of data arrays (CUDA Kernel)
+c = a - b * \beta
+>> a - A matrix
+>> b - another matrix
+>> c - where we put a-b
+>> size - the size of a/b/c
+>> beta - the coefficient
+*/
+__global__
+void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size)
+        c[i] = a[i] - b[i] * beta;
+}
+
+/*
+tensor subtraction c = a - b * \beta (cuda version)
+>> a - a tensor
+>> b - another tensor
+>> c - where we put a-b*\beta.
+>> beta - the scaling factor
+*/
+void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
+                  "Unmatched tensors in addition!");
+    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
+                  "Unmatched tensors in addition!");
+    CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
+                  "The tensors must be on the same!");
+
+    int devIDBackup = XDevice::GetGPUDevice();
+    XDevice::SetGPUDevice(a->devID);
+
+    if (!a->isSparse && !b->isSparse) {
+        CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
+
+        if (a->dataType == DEFAULT_DTYPE &&
+            b->dataType == DEFAULT_DTYPE &&
+            c->dataType == DEFAULT_DTYPE)
+        {
+            int gridSize[3], blockSize[3];
+
+            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+            dim3 blocks(gridSize[0]);
+            dim3 threads(blockSize[0]);
+            KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
+        }
+        else {
+            // TODO!!
+            ShowNTErrors("TODO!");
+        }
+    }
+    else {
+        // TODO!!
+        ShowNTErrors("TODO!");
+    }
+
+    XDevice::SetGPUDevice(devIDBackup);
+}
+
+/* subtraction over arrays
+tensor subtraction c = a - b * \beta (cuda version) with an input handle
+>> devID - device ID (MUST >= 0)
+>> handle - cuda handle
+>> a - an array
+>> b - another array
+>> c - where we put a-b
+>> size - size of the array
+>> beta - the coefficient
+*/
+void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
+{
+    if (size == 0)
+        return;
+
+    if (c == NULL)
+        c = a;
+
+    CheckNTErrors((a && b && c), "Empty arrays in addition!");
+
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
+    if (c == a) {
+#ifdef DOUBELPRICSION
+        cublasDaxpy(*handle, size, &beta, b, 1, a, 1);
+#else
+        cublasSaxpy(*handle, size, &beta, b, 1, a, 1);
+#endif
+    }
+    else {
+        int gridSize[3], blockSize[3];
+
+        GDevs.GetCudaThread(devID, size, gridSize, blockSize);
+
+        dim3 blocks(gridSize[0]);
+        dim3 threads(blockSize[0]);
+
+        KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
+    }
+
+    BacktoCudaDev(devID, devIDBackup);
+}
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sub.cuh
+++ b/source/tensor/core/arithmetic/Sub.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#ifndef __SUB_CUH__
+#define __SUB_CUH__
+
+#include "Sub.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* subtraction of data arrays (CUDA Kernel) */
+__global__
+void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
+
+/* tensor subtraction c = a - b * \beta (cuda version) */
+void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
+
+/*  tensor subtraction c = a - b * \beta (cuda version) with an input handle */
+void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SUB_CUH__
--- a/source/tensor/core/arithmetic/Absolute.h
+++ b/source/tensor/core/arithmetic/Absolute.h
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ * Today is the first day of August. It's still very hot.
+ */

-#ifndef __ABSOLUTE_H__
-#define __ABSOLUTE_H__
+#ifndef __SUB_H__
+#define __SUB_H__

 #include "../../XTensor.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* set every entry to its absolute value */
-void _Absolute(const XTensor * a, XTensor * b);
+/* tensor subtraction c = a - b * \beta */
+void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);

-/*
-set every entry to its absolute value (do it on site)
+/* 
+tensor subtraction a = a - b * \beta
 keep the result in the input tensor a and return nothing
 */
-void _AbsoluteMe(XTensor * a);
-
-/* 
-set every entry to its absolute value (return a XTensor structure)
-make a new tensor to keep the result and return it
+void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+    
+/*
+tensor subtraction c = a - b * \beta
+make a new tensor c to keep the result and return it
 */
-XTensor Absolute(const XTensor & a);
+XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);

 } // namespace nts(NiuTrans.Tensor)

-#endif // __ABSOLUTE_H__
+#endif // __SUB_H__
--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -22,8 +22,10 @@
 #include "../../XTensor.h"
 #include "../../XName.h"
 #include "../../XUtility.h"
+#include "../movement/CopyValues.h"
 #include "Sum.h"
 #include "Sum.cuh"
+#include "SumDim.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -43,8 +45,12 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched tensors in addition!");

-    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+    if(beta == 0){
+        _CopyValues(a, c);
+        return;
+    }

+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
 #ifdef USE_CUDA
        if (a == c) {
            int P2PAccesible = 0;
@@ -67,7 +73,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
    }
    else {
        if (!a->isSparse && !b->isSparse) {
-            CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
+            CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
    
            if (a->dataType == DEFAULT_DTYPE &&
                b->dataType == DEFAULT_DTYPE &&
@@ -123,6 +129,33 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
 {
    _Sum(a, b, a, beta);
 }
+
+/* 
+return a dimension if the sum is performed as SumDim (in more details in SumDim.h 
+>> a - a tensor
+>> b - another tensor for sum
+*/
+int GetSumDimIndex(const XTensor &a, const XTensor &b)
+{
+    if(a.order < b.order)
+        return -1;
+
+    int hitCount = 0;
+    int hitDim = -1;
+    for(int i = 0; i < b.order; i++){
+        if(b.dimSize[b.order - 1 - i] == 1)
+            continue;
+        else if(b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]){
+            hitCount++;
+            hitDim = a.order - b.order + i;
+        }
+    }
+
+    if(hitCount == 1)
+        return hitDim;
+    else
+        return -1;
+}
    
 /*
 tensor summation c = a + b * \beta (return a XTensor structure)
@@ -137,13 +170,29 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
 {
    XTensor c(&a);
    c.SetTMP();
+
+    int n = GetSumDimIndex(a, b);
+
+    if(n == -1){
+        /* call _Sum function */
+        _Sum(&a, &b, &c, beta);
    
-    /* call _Sum function */
-    _Sum(&a, &b, &c, beta);
+        /* tensor connections */
+        XLink::MakeLink(&a, &b, &c, MATH_SUM);
+        XLink::AddParamToHead(&c, beta);
+    }
+    else if(n >= 0 && n < a.order){
+        /* call _Sum function */
+        _SumDim(&a, &b, &c, n, beta);
    
-    /* tensor connections */
-    XLink::MakeLink(&a, &b, &c, MATH_SUM);
-    XLink::AddParamToHead(&c, beta);
+        /* tensor connections */
+        XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
+        XLink::AddParamToHeadInt(&c, n);
+        XLink::AddParamToHead(&c, beta);
+    }
+    else{
+        ShowNTErrors("Something is wrong!");
+    }
    
    return c;
 }

--- a/source/tensor/core/arithmetic/Sum.cu
+++ b/source/tensor/core/arithmetic/Sum.cu
@@ -20,6 +20,7 @@
 */

 #include "../../XDevice.h"
+#include "../../XUtility.h"
 #include "Sum.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+ */
+
+#include "Sum.h"
+#include "SumDim.h"
+#include "SumDim.cuh"
+#include "../../XName.h"
+#include "../movement/CopyValues.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+tensor summation 
+
+c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - where we put a+b*\beta. we save it in a if c is NULL
+>> n - the dimension index
+>> beta - the scaling factor
+*/
+void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in addition!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+
+    if(beta == 0){
+        _CopyValues(a, c);
+        return;
+    }
+
+    if(XTensor::IsSameShaped(a, b)){
+        _Sum(a, b, c, beta);
+        return;
+    }
+
+    if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
+#ifdef USE_CUDA
+        _CudaSumDim(a, b, c, n, beta);
+#else
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+    }
+    else{
+        int stride = 1;
+        int blockSize = a->dimSize[n];
+        int blockNum = 1;
+
+        for(int i = a->order - 1; i >= 0; i--){
+            if(i > n)
+                stride *= a->dimSize[i];
+            else if(i < n)
+                blockNum *= a->dimSize[i];
+        }
+    
+        if (a->dataType == DEFAULT_DTYPE){
+            int num = a->unitNum;
+            if(stride > 1){
+                for(int i = 0, j = 0; i < num; i += stride, j++){
+                    DTYPE * ap =   (DTYPE*)a->data + i;
+                    DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
+                    DTYPE * cp =   (DTYPE*)c->data + i;
+                    for(int k = 0; k < stride; k++)
+                        cp[k] = ap[k] + bv;
+                }
+            }
+            else if(stride == 1){
+                DTYPE * bp = (DTYPE*)b->data;
+                for(int i = 0; i < num; i += blockSize){
+                    DTYPE * ap = (DTYPE*)a->data + i;
+                    DTYPE * cp = (DTYPE*)c->data + i;
+                    if(beta == 1.0F){
+                        for(int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] + bp[j];
+                    }
+                    else{
+                        for(int j = 0; j < blockSize; j++)
+                            cp[j] = ap[j] + bp[j] * beta;
+                    }
+                }
+            }
+            else{
+                ShowNTErrors("Something is wrong!");
+            }
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
+}
+    
+/*
+tensor summation (do it on site)
+keep the result in the input tensor and return nothing
+
+a = a + b * \beta
+where the size of b is equal to the n-th dimension of a,
+i.e., a is summed with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> n - the dimension index
+>> beta - the scaling factor
+*/
+void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
+{
+    _SumDim(a, b, a, n, beta);
+}
+    
+/*
+tensor summation (return a XTensor structure and make tensor connections)
+make a new tensor to keep the result and return it
+
+c = a + b * \beta
+where the size of b is equal to the n-th dimension of a,
+i.e., a is summed with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> n - the dimension index
+>> beta - the scaling factor
+<< return - the result tensor by tensor summation
+*/
+XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
+{
+    XTensor c(&a);
+    c.SetTMP();
+    
+    /* call _Sum function */
+    _SumDim(&a, &b, &c, n, beta);
+    
+    /* tensor connections */
+    XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
+    XLink::AddParamToHeadInt(&c, n);
+    XLink::AddParamToHead(&c, beta);
+    
+    return c;
+}
+    
+}
--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+*/
+
+#include "SumDim.cuh"
+#include "../../XDevice.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* 
+tensor summation of a tensor and a row vector
+c = a + b * \beta 
+where a is a tensor and b is a row vector
+>> a - pointer to the data array of a
+>> b - pointer to the data array of b
+>> c - pointer to the data array of c
+>> rowNum - number of rows of a and c
+>> colNum - number of columns of a and c (i.e., the size of b)
+>> beta - the scaling factor
+*/
+template <class T, bool betaFired>
+__global__
+void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
+{
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    int col = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if(col >= colNum || row >= rowNum)
+        return;
+
+    if(threadIdx.y == 0)
+        bv[threadIdx.x] = b[col];
+
+    __syncthreads();
+
+    int offset = colNum * row + col;
+    if(betaFired)
+        c[offset] = a[offset] + bv[threadIdx.x] * beta;
+    else
+        c[offset] = a[offset] + bv[threadIdx.x];
+}
+
+/* 
+tensor summation of a tensor and a colum vector
+c = a + b * \beta 
+where a is a tensor and b is a colum vector
+>> a - pointer to the data array of a
+>> b - pointer to the data array of b
+>> c - pointer to the data array of c
+>> rowNum - number of rows of a and c (i.e., the size of b)
+>> colNum - number of columns of a and c 
+>> blockNum - size of a block (matrix), i.e., rowNum * colNum
+>> blockNum - number of matrics 
+>> beta - the scaling factor
+*/
+template <class T, bool betaFired>
+__global__
+void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
+{
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+
+    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int col = colIndex % colNum;
+    int block = colIndex / colNum;
+
+    if(row >= rowNum || block >= blockNum)
+        return;
+
+    if(threadIdx.x == 0)
+        bv[threadIdx.y] = b[row];
+
+    __syncthreads();
+
+    int offset = block * blockSize + row * colNum + col;
+    
+    if(betaFired)
+        c[offset] = a[offset] + bv[threadIdx.y] * beta;
+    else
+        c[offset] = a[offset] + bv[threadIdx.y];
+}
+
+/*
+tensor summation (cuda version)
+
+c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> c - where we put a+b*\beta. we save it in a if c is NULL
+>> n - the dimension index
+>> beta - the scaling factor
+*/
+void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
+{
+    CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+                  "Unmatched data types in addition!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+
+    int stride = 1;
+    int blockSize = a->dimSize[n];
+    int blockNum = 1;
+
+    for(int i = a->order - 1; i >= 0; i--){
+        if(i > n)
+            stride *= a->dimSize[i];
+        else if(i < n)
+            blockNum *= a->dimSize[i];
+    }
+
+    int cudaGrids[3];
+    int cudaBlocks[3];
+
+    int devIDBackup = 0;
+    ProtectCudaDev(a->devID, devIDBackup);
+
+    if (a->dataType == DEFAULT_DTYPE){
+        if(stride > 1){
+            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
+            if(beta == (DTYPE)1.0F)
+                KernelAddWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+            else
+                KernelAddWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
+        }
+        else if(stride == 1){
+            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+            if(beta == (DTYPE)1.0F)
+                KernelAddWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 
+                                                  blockNum, blockSize, beta);
+            else
+                KernelAddWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 
+                                                  blockNum, blockSize, beta);
+        }
+        else{
+            ShowNTErrors("Something is wrong!");
+        }
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+
+    BacktoCudaDev(a->devID, devIDBackup);
+}
+
+#endif
+
+} // namespace nts(NiuTrans.Tensor)
+
--- a/source/tensor/core/arithmetic/SumDim.cuh
+++ b/source/tensor/core/arithmetic/SumDim.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+*/
+
+#ifndef __SUMDIM_CUH__
+#define __SUMDIM_CUH__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, 
+   i.e., a is summed with b by broadcasting (cuda version) */
+void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
+
+#endif
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SUMDIM_CUH__
--- a/source/tensor/core/arithmetic/SumDim.h
+++ b/source/tensor/core/arithmetic/SumDim.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
+ * It reached to 39 centigrade around 3:00 pm in Shenyang
+ */
+
+#ifndef __SUMDIM_H__
+#define __SUMDIM_H__
+
+#include "../../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+    
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, 
+   i.e., a is summed with b by broadcasting */
+void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
+    
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, 
+   i.e., a is summed with b by broadcasting. we keep the result in the input tensor a and return nothing */
+void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
+    
+/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, 
+   i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
+XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
+    
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __SUMDIM_H__
--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -20,6 +20,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-05-08
 */

+#include <math.h>
 #include "SetData.h"
 #include "SetData.cuh"
 #include "../../XUtility.h"
@@ -37,6 +38,43 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

+/*
+Fills the input Tensor or Variable with values according to the method described in 
+"Understanding the difficulty of training deep feedforward neural networks" - Glorot, X. & Bengio, Y. (2010), 
+using a uniform distribution. The resulting tensor will have values sampled from :math:`U(-a, a)` 
+where :math:`a = gain \times \sqrt{2 / (fan\_in + fan\_out)} \times \sqrt{3}`. Also known as Glorot initialisation.
+
+>> tensor - the tensor whose data array would be initialized
+>> gain - an optional scaling factor
+*/
+void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
+{
+    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
+    CheckNTErrors(tensor->order >= 2, "the tensor dimension must be no less than 2!");
+
+    int fanIn = 1;
+    int fanOut = 1;
+
+    int order = tensor->order;
+    if (order == 2) {
+        fanIn = tensor->dimSize[1];
+        fanOut = tensor->dimSize[0];
+    }
+    else {
+        int numInputFmaps = tensor->dimSize[1];
+        int numOutputFmaps = tensor->dimSize[0];
+        int receptiveFieldSize = 0;
+        for (int i = 2; i < order; i++)
+            receptiveFieldSize += tensor->dimSize[i];
+        fanIn = numInputFmaps * receptiveFieldSize;
+        fanOut = numOutputFmaps * receptiveFieldSize;
+    }
+
+    DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
+    DTYPE a = (DTYPE)sqrt(3.0) * std;
+    _SetDataRand(tensor, -a, a);
+}
+
 /* 
 generate data items with a fixed value p 
 >> tensor - the tensor whose data array would be initialized
@@ -65,7 +103,7 @@ void _SetDataFixed(XTensor * tensor, void * valuePointer)
        }
        else{
 #ifdef USE_CUDA
-            CudaSetDataFixedInt(tensor, p);
+            _CudaSetDataFixedInt(tensor, p);
 #endif
        }
    }
@@ -88,7 +126,7 @@ void _SetDataFixed(XTensor * tensor, void * valuePointer)
        }
        else{
 #ifdef USE_CUDA
-            CudaSetDataFixedFloat(tensor, p);
+            _CudaSetDataFixedFloat(tensor, p);
 #endif
        }
    }
@@ -111,7 +149,7 @@ void _SetDataFixed(XTensor * tensor, void * valuePointer)
        }
        else{
 #ifdef USE_CUDA
-            CudaSetDataFixedDouble(tensor, p);
+            _CudaSetDataFixedDouble(tensor, p);
 #endif
        }
    }
@@ -137,7 +175,7 @@ generate data items with a fixed value p (in integer)
 */
 void _SetDataFixedInt(XTensor * tensor, int p)
 {
-    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT");
+    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");

    if(p == 0)
        tensor->SetZeroAll();
@@ -152,7 +190,7 @@ generate data items with a fixed value p (in float)
 */
 void _SetDataFixedFloat(XTensor * tensor, float p)
 {
-    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_INT");
+    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");

    if(p == 0)
        tensor->SetZeroAll();
@@ -167,7 +205,7 @@ generate data items with a fixed value p (in double)
 */
 void _SetDataFixedDouble(XTensor * tensor, double p)
 {
-    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_INT");
+    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");

    if(p == 0)
        tensor->SetZeroAll();
@@ -176,32 +214,32 @@ void _SetDataFixedDouble(XTensor * tensor, double p)
 }

 /*
-generate data items with a uniform distribution in [low,high]
+generate data items with a uniform distribution in [lower, upper]
 >> tensor - the tensor whose data array would be initialized
->> low - lower value of the range
->> high - higher value of the range
+>> lower - lower value of the range
+>> upper - upper value of the range
 */
-void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 {
+    CheckNTErrors(upper > lower, "the high value must be greater than low value!");
+
    if(tensor == NULL)
        return;
    
    /* GPU code */
    if(tensor->devID < 0){
-        DTYPE variance = high - low;
-        
-        srand((unsigned)time(NULL));
+        DTYPE variance = upper - lower;
        
        if(tensor->dataType == X_FLOAT){
            float * d = (float*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
-                d[i] = variance * ((float)rand()/RAND_MAX) + low;
+                d[i] = variance * ((float)rand()/RAND_MAX) + lower;
            }
        }
        else if(tensor->dataType == X_DOUBLE){
            double * d = (double*)tensor->data;
            for(int i = 0; i < tensor->unitNum; i++){
-                d[i] = variance * ((double)rand()/RAND_MAX) + low;
+                d[i] = variance * ((double)rand()/RAND_MAX) + lower;
            }
        }
        else{
@@ -215,12 +253,27 @@ void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
    TODO: generate data points on GPUs straightforwardly.
    */
    else{
-        XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
-        _SetDataRand(t2, low, high);
-        _CopyValues(t2, tensor);
-        delete t2;
+#ifdef USE_CUDA
+        _CudaSetDataRand(tensor, lower, upper);
+#endif
+        //XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
+        //_SetDataRand(t2, low, high);
+        //_CopyValues(t2, tensor);
+        //delete t2;
    }
 }
    
+
+/*
+generate data items with a normal distribution with specified mean and standard deviation 
+>> mean - mean or expectation of the distribution
+>> standardDeviation - standard deviation of the distribution
+*/
+void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation)
+{
+    // TODO: rewrite it and add cuda code!!!!!!!
+    tensor->SetDataRandn(mean, standardDeviation);
+}
+
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
@@ -21,7 +21,10 @@
 * I'm surprised that I did not write this file till today.
 */

+#include <curand.h>
+#include <time.h>
 #include "SetData.cuh"
+#include <curand_kernel.h>
 #include "../../XDevice.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -46,7 +49,7 @@ generate data items with a fixed value p (in int)
 >> tensor - the tensor for initialization
 >> p - the initial value
 */
-void CudaSetDataFixedInt(XTensor * tensor, int p)
+void _CudaSetDataFixedInt(XTensor * tensor, int p)
 {
    CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");

@@ -86,7 +89,7 @@ generate data items with a fixed value p (in float)
 >> tensor - the tensor for initialization
 >> p - the initial value
 */
-void CudaSetDataFixedFloat(XTensor * tensor, float p)
+void _CudaSetDataFixedFloat(XTensor * tensor, float p)
 {
    CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");

@@ -126,7 +129,7 @@ generate data items with a fixed value p (in double)
 >> tensor - the tensor for initialization
 >> p - the initial value
 */
-void CudaSetDataFixedDouble(XTensor * tensor, double p)
+void _CudaSetDataFixedDouble(XTensor * tensor, double p)
 {
    CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");

@@ -146,4 +149,156 @@ void CudaSetDataFixedDouble(XTensor * tensor, double p)
    BacktoCudaDev(tensor->devID, devIDBackup);
 }

+/* 
+call curand_init function on each kernel with the same random seed
+and init the rng states
+*/
+__global__ 
+void KernelInitializeCurand(curandState * state, unsigned long seed)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    curand_init(seed, i, 0, &state[i]);
+}
+
+/* */
+__device__ 
+float GenerateFloat(curandState* globalState, int i)
+{
+    //copy state to local mem
+    curandState localState = globalState[0] ;
+    //apply uniform distribution with calculated random
+    float randNum = curand_uniform(&localState);
+    //update state
+    globalState[0] = localState;
+
+    //return value
+    return randNum;
+}
+
+/**/
+__device__ 
+double GenerateDouble(curandState* globalState, int i)
+{
+    //copy state to local mem
+    curandState localState = globalState[i];
+    //apply uniform distribution with calculated random
+    double randNum = curand_uniform_double(&localState);
+    //update state
+    globalState[i] = localState;
+    //return value
+    return randNum;
+}
+
+/* 
+set data array with a uniform distribution in [low, high] 
+>> deviceStates - the state of curand
+>> d - float datatype pointer to the data array 
+>> size - size of the array
+>> lower - low value of the range
+>> variance - the variance of the range
+*/
+__global__
+void KernelSetDataRandFloat(unsigned long seed, float * d, int size, DTYPE low, DTYPE variance)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < size) {
+        curandState deviceStates;
+        curand_init(seed, 0, 0, &deviceStates);
+        //curand_init((unsigned long long)clock() + i, 0, 0, &deviceStates);
+        float randNum = GenerateFloat(&deviceStates, i);
+        //float randNum = curand_uniform(&deviceStates);
+        d[i] = randNum * variance + low;
+    }
+}
+
+__global__
+void KernelSetDataRandFloat(float * d, int size, DTYPE low, DTYPE variance)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        d[i] = d[i] * variance + low;
+    }
+}
+
+/* 
+set data array with a uniform distribution in [low, high] 
+>> deviceStates - the state of curand
+>> d - double datatype pointer to the data array
+>> size - size of the array
+>> lower - low value of the range
+>> variance - the variance of the range
+*/
+__global__
+void KernelSetDataRandDouble(curandState* deviceStates, double * d, int size, DTYPE low, DTYPE variance)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    
+    if (i < size){
+        double randNum = GenerateDouble(deviceStates, i);
+        d[i] = randNum * variance + low;
+    }
+}
+
+
+/*
+generate data items with a uniform distribution in [lower, upper]
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+*/
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
+{
+    CheckNTErrors(upper > lower, "the high value must be greater than low value!");
+
+    int gridSize[3];
+    int blockSize[3];
+
+    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
+
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+
+    int devIDBackup;
+    ProtectCudaDev(tensor->devID, devIDBackup);
+    
+    //curandState *deviceStates;
+    //cudaMalloc(&deviceStates, sizeof(curandState) );
+    DTYPE variance = upper - lower;
+
+    //KernelInitializeCurand<<<blocks, threads >>>(deviceStates, unsigned(time(NULL)));
+    if (tensor->dataType == X_FLOAT)
+        KernelSetDataRandFloat <<<blocks, threads >>>(10, (float*)tensor->data, tensor->unitNum, lower, variance);
+    /*else if (tensor->dataType == X_DOUBLE)
+        KernelSetDataRandDouble <<<blocks, threads >>>(deviceStates, (double*)tensor->data, tensor->unitNum, low, variance);*/
+
+    //cudaFree(deviceStates);
+    BacktoCudaDev(tensor->devID, devIDBackup);
+
+
+    //int num = tensor->unitNum;
+    //curandGenerator_t gen;
+    //curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MRG32K3A);
+    //curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
+    //curandGenerateUniform(gen, (float *)tensor->data, num);
+    //int gridSize[3];
+    //int blockSize[3];
+
+    //GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
+
+    //dim3 blocks(gridSize[0]);
+    //dim3 threads(blockSize[0]);
+
+    //int devIDBackup;
+    //ProtectCudaDev(tensor->devID, devIDBackup);
+   
+    //DTYPE variance = high - low;
+    //if (tensor->dataType == X_FLOAT)
+    //    KernelSetDataRandFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, low, variance);
+    ////else if (tensor->dataType == X_DOUBLE)
+    // //   KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, low, variance);
+    //curandDestroyGenerator(gen);
+    //BacktoCudaDev(tensor->devID, devIDBackup);
+}
+
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
@@ -29,13 +29,16 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* generate data items with a fixed value p (in int) */
-void CudaSetDataFixedInt(XTensor * tensor, int p);
+void _CudaSetDataFixedInt(XTensor * tensor, int p);

 /* generate data items with a fixed value p (in float) */
-void CudaSetDataFixedFloat(XTensor * tensor, float p);
+void _CudaSetDataFixedFloat(XTensor * tensor, float p);

 /* generate data items with a fixed value p (in double) */
-void CudaSetDataFixedDouble(XTensor * tensor, double p);
+void _CudaSetDataFixedDouble(XTensor * tensor, double p);
+
+/* generate data items with a uniform distribution in [lower, upper] */
+void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
@@ -27,6 +27,9 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

+/* generate data items with a xavier initialization */
+void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
+
 /* generate data items with a fixed value p */
 void _SetDataFixed(XTensor * tensor, void * valuePointer);

@@ -42,8 +45,8 @@ void _SetDataFixedFloat(XTensor * tensor, float p);
 /* generate data items with a fixed value p (in double) */
 void _SetDataFixedDouble(XTensor * tensor, double p);

-/* generate data items with a uniform distribution in [low,high] */
-void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
+/* generate data items with a uniform distribution in [lower, upper] */
+void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);

 /* generate data items with a normal distribution with specified mean and standard deviation */
 void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);

--- a/source/tensor/core/arithmetic/Absolute.cpp
+++ b/source/tensor/core/arithmetic/Absolute.cpp
@@ -16,67 +16,130 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
 */

-#include <math.h>
 #include "../../XTensor.h"
 #include "../../XName.h"
-#include "Absolute.h"
-#include "Absolute.cuh"
+#include "Clip.h"
+#include "Clip.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-set every entry to its absolute value
+set every entry to its clip value
 >> a - input tensor we are processing
 >> b - output tensor we are processing
+>> lower - the lower border
+>> upper - the upper border
 */
-void _Absolute(const XTensor * a, XTensor * b)
+void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
 {
 #ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaAbsolute(a, b);
-    return;
-}
+	/* run it on GPUs */
+	if (a->devID >= 0) {
+		_CudaClip(a, b, lower, upper);
+		return;
+	}
 #endif

-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++)
-        db[i] = (DTYPE)fabs(d[i]);
+	CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+	CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
+
+	DTYPE * d = (DTYPE*)a->data;
+	DTYPE * db = (DTYPE*)b->data;
+	for (int i = 0; i < a->unitNum; i++) {
+		if (d[i] > upper)
+			db[i] = upper;
+		else if (d[i] < lower)
+			db[i] = lower;
+		else
+			db[i] = d[i];
+	}
 }

 /*
-set every entry to its absolute value (do it on site)
+set every entry to its clip value (do it on site)
 keep the result in the input tensor a and return nothing
 >> a - the tensor we are processing
+>> lower - the lower border
+>> upper - the upper border
 */
-void _AbsoluteMe(XTensor * a)
+void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
 {
-    _Absolute(a, a);
+	_Clip(a, a, lower, upper);
 }

 /*
-set every entry to its absolute value (return a XTensor structure)
+set every entry to its clip value (return a XTensor structure)
 make a new tensor to keep the result and return it
 >> a - input tensor we are processing
-<< return - the absolute value of input tensor
+>> lower - the lower border
+>> upper - the upper border
+<< return - the clip value of the input tensor
 */
-XTensor Absolute(const XTensor & a)
+XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
+{
+	XTensor b(&a);
+	b.SetTMP();
+
+	/* call _Clip function */
+	_Clip(&a, &b, lower, upper);
+
+	/* tensor connections */
+	XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
+	XLink::AddParamToHead(&b, lower);
+	XLink::AddParamToHead(&b, upper);
+
+	return b;
+}
+
+/*
+backward computation
+
+dE/dx = dE/dy * dy/dx
+
+hard tanh: y =  upper    if x > upper
+x    if lower <= x <= upper
+lower    if x< lower
+
+and dy/dx =  1    if lower <= x <= upper
+0    otherwise
+
+>> gold - gold standard to measure error (or loss)
+>> y - output of the function
+>> x - input of the function
+>> dedy - dE/dy
+>> dedx - dE/dx
+>> lossName - type of loss function, e.g., cross entropy
+*/
+void _ClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper) 
 {
-    XTensor b(&a);
-    b.SetTMP();
-    
-    /* call _Absolute function */
-    _Absolute(&a, &b);
-    
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
    
-    return b;
+#ifdef USE_CUDA
+    if (x->devID >= 0) {
+        _CudaClipBackward(y, x, dedy, dedx, lower, upper);
+        return;
 }
+#endif
+
+    if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
+        DTYPE * dedyp = (DTYPE*)dedy->data;
+        DTYPE * dedxp = (DTYPE*)dedx->data;
+        DTYPE * ip = (DTYPE*)x->data;
+        int size = y->unitNum;
+
+        /* dE/dx = dE/dy * dy/dx */
+        for (int i = 0; i < size; i++) {
+            DTYPE s = ip[i];
+            if (s > upper || s < lower)
+                dedxp[i] = 0;
+            else
+                dedxp[i] = dedyp[i];
+        }
+    }
+    else
+        ShowNTErrors("TODO!");
+}
+
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Absolute.cu
+++ b/source/tensor/core/arithmetic/Absolute.cu
@@ -16,78 +16,162 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
 */

 #include "../../XDevice.h"
 #include "../../XTensor.h"
-#include "Absolute.h"
-#include "Absolute.cuh"
+#include "Clip.h"
+#include "Clip.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA
 /*
-set each entry to its absolute value (CUDA Kernel)
+set each entry to its clip value (CUDA Kernel)
 >> a - pointer to input data array
 >> b - pointer to output data array
+>> lower - the lower border
+>> upper - the upper border
 >> size - size of the data array
 */
 __global__
-void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
+	void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
 {
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+	int i = blockDim.x * blockIdx.x + threadIdx.x;

-    if (i < size)
-        b[i] = fabs(a[i]);
+	if (i < size) {
+		if (a[i] > upper)
+			b[i] = upper;
+		else if (a[i] < lower)
+			b[i] = lower;
+		else
+			b[i] = a[i];
+	}
 }

 /*
-set each entry to its absolute value (CUDA Kernel)
+set each entry to its clip value with float16 data type value (CUDA Kernel)
 This is for float16 computation
 >> a - pointer to input data array
 >> b - pointer to output data array
+>> lower - the lower border
+>> upper - the upper border
 >> size - size of the data array
 */
 __global__
-void KernelAbsolute(__half * a, __half * b, int size)
+void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
 {
-    return;
+	return;
 }

 /*
-set each entry to its absolute value
->> a - input tensor
->> b - output tensor
+set each entry to its clip value
+>> a - input tensor we are processing
+>> b - output tensor we are processing
+>> lower - the lower border
+>> upper - the upper border
 */
-void _CudaAbsolute(const XTensor * a, XTensor * b)
+void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
 {
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->isSparse == false), "TODO!");
+	CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+	CheckNTErrors((a->isSparse == false), "TODO!");
+
+	int gridSize[3];
+	int blockSize[3];
+
+	GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+
+	dim3 blocks(gridSize[0]);
+	dim3 threads(blockSize[0]);

-    int gridSize[3];
-    int blockSize[3];
+	int devIDBackup;
+	ProtectCudaDev(a->devID, devIDBackup);

-    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+	if (a->dataType == DEFAULT_DTYPE) {
+		KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
+	}
+	else if (a->dataType == X_FLOAT16) {
+		KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
+	}
+	else {
+		ShowNTErrors("TODO!");
+	}

-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
+	BacktoCudaDev(a->devID, devIDBackup);
+}
+
+/*
+clip backward computation of dE/dx (Cuda kernel)

-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
+dy/dx = 1     if lower <= x <= upper
+0     otherwise

-    if (a->dataType == DEFAULT_DTYPE) {
-        KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
+>> dedy - dE/dy
+>> dedx - dE/dx
+>> y - y of the function
+>> x - x of the function
+>> lower 
+>> upper 
+*/
+__global__
+void KernelClipBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, DTYPE * x, DTYPE lower, DTYPE upper, int size)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < size) {
+        DTYPE s = x[i];
+        if (s > upper || s < lower)
+            dedx[i] = 0;
+        else
+            dedx[i] = dedy[i];
    }
-    else if (a->dataType == X_FLOAT16) {
-        KernelAbsolute << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
+}
+
+/*
+backward computation (Cuda version)
+
+dE/dx = dE/dy * dy/dx
+
+hard tanh: y =  upper    if x > upper
+x    if lower <= x <= upper
+lower    if x< lower
+
+and dy/dx =  1    if lower <= x <= upper
+0    otherwise
+
+>> gold - gold standard to measure error (or loss)
+>> y - output of the function
+>> x - input of the function
+>> dedy - dE/dy
+>> dedx - dE/dx
+>> lossName - type of loss function, e.g., cross entropy
+*/
+void _CudaClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper)
+{
+    if (x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) {
+
+        int gridSize[3], blockSize[3];
+
+        GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
+
+        int devIDBackup;
+        ProtectCudaDev(x->devID, devIDBackup);
+
+        /* dE/dx = dE/dy * dy/dx */
+        KernelClipBackward <<<dim3(gridSize[0]), dim3(blockSize[0])>>>
+                             ((DTYPE*)dedy->data,
+                              (DTYPE*)dedx->data,
+                              (DTYPE*)y->data, (DTYPE*)x->data,
+                              lower, upper,
+                              x->unitNum);
+
+        BacktoCudaDev(x->devID, devIDBackup);
    }
-    else {
+    else
        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
 }

+
 #endif // USE_CUDA
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/math/Clip.cuh
+++ b/source/tensor/core/math/Clip.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
+*/
+
+#ifndef __CLIP_CUH__
+#define __CLIP_CUH__
+
+#include "Clip.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its clip value (CUDA Kernel) */
+__global__
+void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
+
+/* set each entry to its clip value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
+
+/* set each entry to its clip value */
+void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
+
+/* backward of Clip function (CUDA Kernel) */
+__global__
+void KernelClipBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * y, DTYPE * x, DTYPE lower, DTYPE upper, int size);
+
+/* backward of Clip function */
+void _CudaClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __CLIP_H__
\ No newline at end of file
--- a/source/tensor/core/math/Log.cpp
+++ b/source/tensor/core/math/Log.cpp
@@ -16,67 +16,36 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
 */

+#ifndef __CLIP_H__
+#define __CLIP_H__
+
 #include "../../XTensor.h"
-#include "../../XName.h"
-#include "Log.h"
-#include "Log.cuh"
-#include <math.h>

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/*
-set every entry to its log value (do it on site)
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _Log(const XTensor * a, XTensor * b)
-{
-#ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaLog(a, b);
-    return;
-    }
-#endif
-
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++)
-        db[i] = (DTYPE)log(d[i]);
-}
+/* set every entry to its clip value */
+void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);

 /*
-set every entry to its log value
+set every entry to its clip value (do it on site)
 keep the result in the input tensor a and return nothing
->> a - the tensor we are processing
 */
-void _LogMe(XTensor * a)
-{
-    _Log(a, a);
-}
+void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);

 /*
-set every entry to its log value (return a XTensor structure)
+set every entry to its clip value  (return a XTensor structure)
 make a new tensor to keep the result and return it
->> a - input tensor we are processing
-<< return - the log value of the input tensor
 */
-XTensor Log(const XTensor & a)
-{
-    XTensor b(&a);
-    b.SetTMP();
-    
-    /* call _Log function */
-    _Log(&a, &b);
-    
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_LOG);
-    
-    return b;
-}
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
+
+/*
+backward of Clip function
+*/
+void _ClipBackward(XTensor * y, XTensor * x, XTensor * dedy, XTensor * dedx, DTYPE lower, DTYPE upper);
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __CLIP_H__
--- a/source/tensor/core/math/Log.cu
+++ b/source/tensor/core/math/Log.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "Log.h"
-#include "Log.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-/*
-set each entry to its log value (CUDA Kernel)
->> a - pointer to input data array
->> b - pointer to output data array
->> size - size of the data array
-*/
-__global__
-void KernelLog(DTYPE * a, DTYPE * b, int size)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        b[i] = log(a[i]);
-}
-
-/*
-set each entry to its log value (CUDA Kernel)
-This is for float16 computation
->> a - pointer to input data array
->> b - pointer to output data array
->> size - size of the data array
-*/
-__global__
-void KernelLog(__half * a, __half * b, int size)
-{
-    return;
-}
-
-/*
-set each entry to its log value
->> a - input tensor
->> b - output tensor
-*/
-void _CudaLog(const XTensor * a, XTensor * b)
-{
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->isSparse == false), "TODO!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    if (a->dataType == DEFAULT_DTYPE) {
-        KernelLog << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
-    }
-    else if (a->dataType == X_FLOAT16) {
-        KernelLog << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
-    }
-    else {
-        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
@@ -110,7 +110,7 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
    int cudaBlockSize[3];

    GDevs.GetCudaThread2D(input->devID, strideNum, stride * blockNum,
-        MAX_INT, cudaGridSize, cudaBlockSize);
+                          MAX_INT, cudaGridSize, cudaBlockSize);

    dim3 blocks(cudaGridSize[1], cudaGridSize[0]);
    dim3 threads(cudaBlockSize[1], cudaBlockSize[0]);
@@ -119,9 +119,9 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
    ProtectCudaDev(a->devID, devIDBackup);

    KernelNormalize << <blocks, threads >> >((DTYPE*)input->data, (DTYPE*)output->data,
-        (DTYPE*)mean->data, (DTYPE*)var->data,
-        (DTYPE*)a->data, (DTYPE*)b->data, epsilon,
-        stride, strideNum, blockNum);
+                                             (DTYPE*)mean->data, (DTYPE*)var->data,
+                                             (DTYPE*)a->data, (DTYPE*)b->data, epsilon,
+                                              stride, strideNum, blockNum);

    BacktoCudaDev(a->devID, devIDBackup);
 }

--- a/source/tensor/core/math/Power.cpp
+++ b/source/tensor/core/math/Power.cpp
@@ -60,8 +60,12 @@ void _Power(const XTensor * a, XTensor * b, DTYPE p)
            bData[i] = aData[i] * aData[i];
    }
    else {
-        for (int i = 0; i < a->unitNum; i++)
-            bData[i] = (DTYPE)pow(aData[i], p);
+        for (int i = 0; i < a->unitNum; i++) {
+            if (p < 0 && aData[i] == 0)
+                bData[i] = 1e20F;
+            else
+                bData[i] = (DTYPE)pow(aData[i], p);
+        }
    }
 }


--- a/source/tensor/core/math/Power.cu
+++ b/source/tensor/core/math/Power.cu
@@ -77,8 +77,13 @@ void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
 {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

-    if (i < size)
-        b[i] = pow(a[i], p);
+    if (i < size) {
+        DTYPE v = a[i];
+        if (p < 0 && v == 0)
+            b[i] = 1e20;
+        else
+            b[i] = pow(a[i], p);
+    }
 }

 /*
@@ -94,8 +99,13 @@ void KernelPower(__half * a, __half * b, __half p, int size)
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
 #else
    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i < size)
-        b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
+    if (i < size) {
+        float v = __half2float(a[i]);
+        if (__half2float(p) < 0 && v == 0)
+            b[i] = __float2half(1e20);
+        else
+            b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
+    }
 #endif
 }


--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
+#include <math.h>
+#include "../../XName.h"
+#include "Unary.h"
+#include "Unary.cuh"
+
+namespace nts{
+    
+
+#ifdef USE_CUDA
+/* define three marco separately, specify the respective function names */
+#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc)          \
+void _funcName(const XTensor * a, XTensor * b)                              \
+{                                                                           \
+    /* run it on GPUs */                                                    \
+    if (a->devID >= 0) {                                                    \
+        _cudaFuncName(a, b);                                                \
+    return;                                                                 \
+    }                                                                       \
+    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
+                  "Input tensors should have the same type!");              \
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");                 \
+    DTYPE * d = (DTYPE*)a->data;                                            \
+    DTYPE * db = (DTYPE*)b->data;                                           \
+    for (int i = 0; i < a->unitNum; i++)                                    \
+        db[i] = (DTYPE)origFunc(d[i]);                                      \
+}
+
+#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName)                   \
+void _funcNameMe(XTensor * a)                                               \
+{                                                                           \
+    _funcName(a, a);                                                        \
+}        
+
+#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId)             \
+XTensor funcName(const XTensor &a)                                          \
+{                                                                           \
+    XTensor b(&a);                                                          \
+    b.SetTMP();                                                             \
+    _funcName(&a, &b);                                                      \
+    XLink::MakeLink(&a, NULL, &b, operationId);                             \
+    return b;                                                               \
+}
+
+_SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
+_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
+SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
+
+_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
+_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
+SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
+
+_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
+_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
+SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
+
+_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
+_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
+SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
+
+_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
+_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
+SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
+
+_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
+_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
+SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
+
+/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
+_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
+#else
+/* define three marco separately, specify the respective function names */
+#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc)          \
+void _funcName(const XTensor * a, XTensor * b)                              \
+{                                                                           \
+    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
+                  "Input tensors should have the same type!");              \
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");                 \
+    DTYPE * d = (DTYPE*)a->data;                                            \
+    DTYPE * db = (DTYPE*)b->data;                                           \
+    for (int i = 0; i < a->unitNum; i++)                                    \
+        db[i] = (DTYPE)origFunc(d[i]);                                      \
+}
+
+#define _SIMPLE_UNARY_FUNCTION_ME(_funcNameMe, _funcName)                   \
+void _funcNameMe(XTensor * a)                                               \
+{                                                                           \
+    _funcName(a, a);                                                        \
+}        
+
+#define SIMPLE_UNARY_FUNCTION(funcName, _funcName, operationId)             \
+XTensor funcName(const XTensor &a)                                          \
+{                                                                           \
+    XTensor b(&a);                                                          \
+    b.SetTMP();                                                             \
+    _funcName(&a, &b);                                                      \
+    XLink::MakeLink(&a, NULL, &b, operationId);                             \
+    return b;                                                               \
+}
+
+_SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
+_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
+SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
+
+_SIMPLE_UNARY_FUNCTION(_Exp, exp)
+_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
+SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
+
+_SIMPLE_UNARY_FUNCTION(_Log, log)
+_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
+SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
+
+_SIMPLE_UNARY_FUNCTION(_Sin, sin)
+_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
+SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
+
+_SIMPLE_UNARY_FUNCTION(_Cos, cos)
+_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
+SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
+
+_SIMPLE_UNARY_FUNCTION(_Tan, tan)
+_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
+SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
+
+/*_SIMPLE_UNARY_FUNCTION(_Round, round)
+_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
+SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
+#endif
+
+}
\ No newline at end of file
--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
+#include <math.h>
+#include "../../XDevice.h"
+#include "../../XName.h"
+#include "Unary.cuh"
+
+namespace nts {
+
+#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc)                       \
+__global__                                                                  \
+void Kernel##funcName(DTYPE * a, DTYPE * b, int size)                       \
+{                                                                           \
+    int i = blockDim.x * blockIdx.x + threadIdx.x;                          \
+                                                                            \
+    if (i < size)                                                           \
+        b[i] = (DTYPE)origFunc(a[i]);                                       \
+}                                                                           \
+__global__                                                                  \
+    void Kernel##funcName(__half * a, __half * b, int size)                 \
+{                                                                           \
+    return;                                                                 \
+}                                                                           \
+void _Cuda##funcName(const XTensor * a, XTensor * b)                        \
+{                                                                           \
+    CheckNTErrors((XTensor::IsSameShaped(a, b)),                            \
+                  "Input tensors should have the same type!");              \
+    CheckNTErrors((a->isSparse == false), "TODO!");                         \
+                                                                            \
+    int gridSize[3];                                                        \
+    int blockSize[3];                                                       \
+                                                                            \
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);         \
+                                                                            \
+    dim3 blocks(gridSize[0]);                                               \
+    dim3 threads(blockSize[0]);                                             \
+                                                                            \
+    int devIDBackup;                                                        \
+    ProtectCudaDev(a->devID, devIDBackup);                                  \
+                                                                            \
+    if (a->dataType == DEFAULT_DTYPE) {                                     \
+        Kernel##funcName << <blocks, threads >> >                           \
+                     ((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);        \
+    }                                                                       \
+    else if (a->dataType == X_FLOAT16) {                                    \
+        Kernel##funcName << <blocks, threads >> >                           \
+                     ((__half*)a->data, (__half*)b->data, a->unitNum);      \
+    }                                                                       \
+    else {                                                                  \
+        ShowNTErrors("TODO!");                                              \
+    }                                                                       \
+                                                                            \
+    BacktoCudaDev(a->devID, devIDBackup);                                   \
+}                                                                           \
+
+SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
+SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
+SIMPLE_UNARY_FUNCTION_GPU(Log, log)
+SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
+SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
+SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
+//SIMPLE_UNARY_FUNCTION_GPU(Round, round)
+
+}
\ No newline at end of file
--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#ifndef __UNARY_CUH__
+#define __UNARY_CUH__
+
+#include "../../XTensor.h"
+#include "Unary.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
+
+/* set each entry to its absolute value (CUDA Kernel) */
+__global__
+void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelAbsolute(__half * a, __half * b, int size);
+/* set each entry to its absolute value */
+void _CudaAbsolute(const XTensor * a, XTensor * b);
+
+/* set each entry to its exponent value (CUDA Kernel) */
+__global__
+void KernelExp(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its exponent value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelExp(__half * a, __half * b, int size);
+/* set each entry to its exponent value */
+void _CudaExp(const XTensor * a, XTensor * b);
+
+/* set each entry to its logarithm value (CUDA Kernel) */
+__global__
+void KernelLog(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its logarithm value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelLog(__half * a, __half * b, int size);
+/* set each entry to its logarithm value */
+void _CudaLog(const XTensor * a, XTensor * b);
+
+/* set each entry to its sine value (CUDA Kernel) */
+__global__
+void KernelSin(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its sine value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelSin(__half * a, __half * b, int size);
+/* set each entry to its sine value */
+void _CudaSin(const XTensor * a, XTensor * b);
+
+/* set each entry to its cosine value (CUDA Kernel) */
+__global__
+void KernelCos(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its cosine value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelCos(__half * a, __half * b, int size);
+/* set each entry to its cosine value */
+void _CudaCos(const XTensor * a, XTensor * b);
+
+/* set each entry to its tangent value (CUDA Kernel) */
+__global__
+void KernelTan(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its tangent value (CUDA Kernel) with float16 data type*/
+__global__
+void KernelTan(__half * a, __half * b, int size);
+/* set each entry to its tangent value */
+void _CudaTan(const XTensor * a, XTensor * b);
+
+/* set each entry to its round value (CUDA Kernel) */
+//__global__
+//void KernelRound(DTYPE * a, DTYPE * b, int size);
+/* set each entry to its round value (CUDA Kernel) with float16 data type*/
+//__global__
+//void KernelRound(__half * a, __half * b, int size);
+/* set each entry to its round value */
+//void _CudaRound(const XTensor * a, XTensor * b);
+
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __UNARY_CUH__
\ No newline at end of file
--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#ifndef __UNARY_H__
+#define __UNARY_H__
+
+#include "../../XTensor.h"
+
+namespace nts{
+
+/* set every entry to its absolute value */
+void _Absolute(const XTensor * a, XTensor * b);
+/* 
+set every entry to its absolute value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _AbsoluteMe(XTensor * a);
+/* 
+set every entry to its absolute value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Absolute(const XTensor & a);
+
+/* set every entry to its exponent value */
+void _Exp(const XTensor * a, XTensor * b);
+/* 
+set every entry to its exponent value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _ExpMe(XTensor * a);
+/* 
+set every entry to its exponent value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Exp(const XTensor & a);
+
+/* set every entry to its logarithm value */
+void _Log(const XTensor * a, XTensor * b);
+/* 
+set every entry to its logarithm value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _LogMe(XTensor * a);
+/* 
+set every entry to its logarithm value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Log(const XTensor & a);
+
+/* set every entry to its sine value */
+void _Sin(const XTensor * a, XTensor * b);
+/* 
+set every entry to its sine value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _SinMe(XTensor * a);
+/* 
+set every entry to its sine value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Sin(const XTensor & a);
+
+/* set every entry to its cosine value */
+void _Cos(const XTensor * a, XTensor * b);
+/* 
+set every entry to its cosine value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _CosMe(XTensor * a);
+/* 
+set every entry to its cosine value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Cos(const XTensor & a);
+
+/* set every entry to its tangent value */
+void _Tan(const XTensor * a, XTensor * b);
+/* 
+set every entry to its tangent value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void _TanMe(XTensor * a);
+/* 
+set every entry to its tangent value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor Tan(const XTensor & a);
+
+
+/* set every entry to its round value */
+//void _Round(const XTensor * a, XTensor * b);
+/* 
+set every entry to its round value (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+//void _RoundMe(XTensor * a);
+/* 
+set every entry to its round value (return a XTensor structure)
+make a new tensor to keep the result and return it
+*/
+//XTensor Round(const XTensor & a);
+
+}
+#endif //end __UNARY_H__
\ No newline at end of file
--- a/source/tensor/core/movement/CopyBlocks.cpp
+++ b/source/tensor/core/movement/CopyBlocks.cpp
@@ -35,24 +35,33 @@ copy a number of blocks to target positions
 >> target - target data array
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
+>> devID - device id
 */
-void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (myMem != NULL)
+        devID = myMem->devID;
+    
+    if (devID >= 0) {
 #ifdef USE_CUDA
        /* copy the index from host to device */
-        int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+        int * targetBlocksTMP = myMem != NULL ?
+                               (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)):
+                               (int*)XMemAlloc(devID, blockNum * sizeof(int));
        XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));

-        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
+        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, devID);

-        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        if(myMem != NULL)
+            myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        else
+            XMemFree(devID, targetBlocksTMP);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
+        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, devID);
    }
 }

@@ -65,11 +74,12 @@ copy a number of blocks source source positions to target positions
 >> target - target data array
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
+>> devID - device id
 */
 void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
    if (myMem != NULL)
-        CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
+        devID = myMem->devID;

    if (devID >= 0) {
 #ifdef USE_CUDA

--- a/source/tensor/core/movement/CopyBlocks.h
+++ b/source/tensor/core/movement/CopyBlocks.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* copy a number of blocks to target positions */
-void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

 /* copy a number of blocks from source positions to target positions */
 void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

--- a/source/tensor/core/movement/CopyBlocksInGrid.cu
+++ b/source/tensor/core/movement/CopyBlocksInGrid.cu
@@ -223,8 +223,11 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN

    int cudaGrids[3];
    int cudaBlocks[3];
-
    int threadNum = MIN(MAX(blockSize, blockNum), MAX_CUDA_THREAD_NUM_PER_BLOCK);
+
+    int devIDBackup;
+    ProtectCudaDev(myMem->devID, devIDBackup);
+
    GDevs.GetCudaThread2D(myMem->devID, threadNum, gridNum * blockNum, INT_MAX, cudaGrids, cudaBlocks);

    cudaBlocks[1] = 1;
@@ -237,39 +240,41 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN
    if (blockNum == 4) {
        if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
            KernelCopyBlocksInGridFast<int, 4, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
        else
            KernelCopyBlocksInGridFast<int, 4, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
    }
    else if (blockNum == 6) {
        if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
            KernelCopyBlocksInGridFast<int, 6, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
        else
            KernelCopyBlocksInGridFast<int, 6, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
    }
    else if (blockNum == 8) {
        if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
            KernelCopyBlocksInGridFast<int, 8, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
        else
            KernelCopyBlocksInGridFast<int, 8, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                    ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
    }
    else if (blockNum == 12) {
        if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
            KernelCopyBlocksInGridFast<int, 12, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                     ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
        else
            KernelCopyBlocksInGridFast<int, 12, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                                     ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
    }
    else {
        KernelCopyBlocksInGrid<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
+                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
    }
+
+    BacktoCudaDev(myMem->devID, devIDBackup);
 }
 #endif // USE_CUDA


--- a/source/tensor/core/movement/CopyBlocksOnSite.cpp
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cpp
@@ -34,29 +34,35 @@ all the data has been on the device (CPU/GPU) already.
 >> blockNum - number of blocks
 >> target - target data array
 >> targetBlocks - target positions of the copy
->> myMem - the memory pool
+>> devID - device id
 */
-void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
+        _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, devID);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        int devID = myMem != NULL ? myMem->devID : -1;
-
        /* 
        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
        one kernel to do block copy in batch (kernel fusion). 
        */
-        for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
-            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
-                (char*)source + b, devID, blockSize);
+        if(blockSize == sizeof(int)){
+            for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
+                *(int*)((char*)target + targetBlocks[i] * blockSize) = 
+                *(int*)((char*)source + b);
+            }
+        }
+        else{
+            for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
+                XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
+                         (char*)source + b, devID, blockSize);
+            }
        }
    }
 }
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/movement/CopyBlocksOnSite.cu
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cu
@@ -36,39 +36,48 @@ NOTE that this version makes more use of the 2d threads in cuda
 >> target - target data array
 >> targetBlocks - target positions of the copy
 */
-template<int miniBlockSize>
+template<class T>
 __global__
-void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks)
+void KernelCopyBlocks(T * source, int blockSize, int blockNum, T * target, int * targetBlocks)
 {
    /* entry index in the block */
-    int i = (blockDim.x * blockIdx.x + threadIdx.x) * miniBlockSize;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;

    /* block index */
    int j = blockDim.y * blockIdx.y + threadIdx.y;

-    if (j >= blockNum)
+    if (i >= blockSize || j >= blockNum)
        return;

-    /* target position */
-    int k = targetBlocks[j];
-
-    DTYPE * s = source + blockSize * j;
-    DTYPE * t = target + blockSize * k;
-
-    if (i < blockSize) {
-        if (miniBlockSize == 4) {
-            t[i] = s[i];
-            t[i + 1] = s[i + 1];
-            t[i + 2] = s[i + 2];
-            t[i + 3] = s[i + 3];
-        }
-        else if (miniBlockSize <= 1) {
-            t[i] = s[i];
-        }
-        else {
-            printf("something wrong!");
-        }
-    }
+    T * s = source + blockSize * j;
+    T * t = target + blockSize * targetBlocks[j];
+
+    t[i] = s[i];
+}
+
+/*
+copy a number of blocks to target positions
+NOTE that this version makes more use of the 2d threads in cuda
+>> source - data array (head of the blocks) to copy from
+>> blockSize - size of block
+>> blockNum - number of blocks
+>> target - target data array
+>> targetBlocks - target positions of the copy
+*/
+template<class T>
+__global__
+void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, T * target, int * targetBlocks)
+{
+    /* entry index in the block */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i >= totalSize)
+        return;
+
+    int targetBlockID = targetBlocks[i / blockSize];
+    int targetOffset  = i % blockSize;
+
+    *(target + blockSize * targetBlockID + targetOffset) = source[i];
 }

 /*
@@ -78,29 +87,42 @@ copy a number of blocks to target positions (cuda version)
 >> blockNum - number of blocks
 >> target - target data array
 >> targetBlocks - target positions of the copy (on the device)
->> myMem - memory pool
+>> devID - device id
 */
-void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
 {
-    CheckNTErrors((myMem != NULL), "No memory pool!");
-    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
-    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
-
+    CheckNTErrors(devID >= 0, "Wrong device to run!");
    int cudaGrids[3];
    int cudaBlocks[3];
-    int bSize = blockSize / sizeof(DTYPE);

-    if (bSize % 4 == 0) {
-        GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<4> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
+    if(blockSize % sizeof(double) == 0){
+        int bSize = blockSize / sizeof(double);
+        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
+        KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+                                    ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
+        //GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        //KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        //                            ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
+    }
+    else 
+    if(blockSize % sizeof(float) == 0){
+        int bSize = blockSize / sizeof(float);
+        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
+        KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+                                   ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
+        //GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        //KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        //                         ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
    }
-    else {
-        GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+    else{
+        ShowNTErrors("Unsupported block size!");
    }
+
+    BacktoCudaDev(devID, devIDBackup);
 }
 #endif // USE_CUDA

-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/movement/CopyBlocksOnSite.cuh
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cuh
@@ -28,15 +28,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-/* copy a number of blocks to target positions */
-__global__
-void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
-
 /* copy a number of blocks to target positions (cuda version) */
-void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);

 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)

-#endif // __COPYBLOCKS_CUH__
\ No newline at end of file
+#endif // __COPYBLOCKS_CUH__
--- a/source/tensor/core/movement/CopyBlocksOnSite.h
+++ b/source/tensor/core/movement/CopyBlocksOnSite.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* copy a number of blocks to target positions (on site) */
-void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/movement/CopyBlocksSelected.cu
+++ b/source/tensor/core/movement/CopyBlocksSelected.cu
@@ -75,6 +75,9 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
    CheckNTErrors(devID >= 0, "Wrong device to run!");
    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");

+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
    /* copy the index to the GPU memory */
    int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
    int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
@@ -97,6 +100,8 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
        XMemFree(devID, sourceBlocksTMP);
        XMemFree(devID, targetBlocksTMP);
    }
+
+    BacktoCudaDev(devID, devIDBackup);
 }

 #endif // USE_CUDA

--- a/source/tensor/core/movement/CopyIndexed.cpp
+++ b/source/tensor/core/movement/CopyIndexed.cpp
@@ -37,8 +37,8 @@ copy indexed sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index, 
-   e.g., for srcIndex = [1,4] and copyNum = 2,
-   we actually copy the source sub-tensors 1, 2, 4, 5
+             e.g., for srcIndex = [1,4] and copyNum = 2,
+             we actually copy the source sub-tensors 1, 2, 4, 5
 */
 void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
 {
@@ -73,17 +73,23 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
    int * realSrcIndex = new int[realIndexSize];
    int * realTgtIndex = new int[realIndexSize];
    for (int i = 0; i < indexOffsetNum; i++) {
+        int base = i * indexSize * copyNum;
+        int baseSrc = i * leadDimSizeSrc;
+        int baseTgt = i * leadDimSizeTgt;
        for (int j = 0; j < indexSize; j++) {
+            int offset = base + j * copyNum;
+            int * rsi = realSrcIndex + offset;
+            int * rti = realTgtIndex + offset;
            for (int k = 0; k < copyNum; k++) {
-                realSrcIndex[i * indexSize * copyNum + j * copyNum + k] = i * leadDimSizeSrc + srcIndex[j] + k;
-                realTgtIndex[i * indexSize * copyNum + j * copyNum + k] = i * leadDimSizeTgt + tgtIndex[j] + k;
+                rsi[k] = baseSrc + srcIndex[j] + k;
+                rti[k] = baseTgt + tgtIndex[j] + k;
            }
        }
    }

    for (int i = 0; i < indexSize; i++) {
-        CheckNTErrors((srcIndex[i] < blockNumSrc), "Index is out of range!");
-        CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
+        CheckNTErrors((srcIndex[i] < blockNumSrc), "Index is out of scope!");
+        CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of scope!");
    }

    _CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);

--- a/source/tensor/core/movement/CopyValues.cpp
+++ b/source/tensor/core/movement/CopyValues.cpp
@@ -20,6 +20,7 @@
 */

 #include "../../XName.h"
+#include "../../XUtility.h"
 #include "CopyValues.h"
 #include "CopyValues.cuh"

@@ -35,14 +36,14 @@ copy s to t
 void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
 {
    CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
-    CheckNTErrors((s->data != NULL), "Cannot copy from an empty data array!");
+    CheckNTErrors((s->data != NULL), "Cannot copy an empty data array!");
    CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
    CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");

    if ((s->dataType == X_FLOAT16 && t->dataType == X_FLOAT) ||
        (s->dataType == X_FLOAT && t->dataType == X_FLOAT16)) {
        CheckNTErrors(((s->devID < 0 && t->devID < 0) || s->devID == t->devID),
-            "The code must be run on the same device!");
+                       "The code must be run on the same device!");
        CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
        ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
    }
@@ -69,6 +70,34 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
 }

 /*
+copy s to t
+
+>> s - source
+>> sBeg - begining of the segment 
+>> sLen - length of the segment
+>> t - target
+>> tBeg - beginning of the segment on the target side
+>> stream - the stream for creating the job pipeline
+*/
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
+{
+    CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
+    CheckNTErrors(s->data != NULL && t->data != NULL, "Cannot copy an empty data array!");
+    CheckNTErrors(s->unitSize == t->unitSize, "The input tensors must be of the same unit size!");
+    CheckNTErrors(s->order > sBeg && sBeg >= 0 && sLen <= s->unitNum, "Wrong segment on the source side");
+    CheckNTErrors(t->order > tBeg && tBeg >= 0, "Wrong segment on the target side");
+
+    if (!s->isSparse && !t->isSparse) {
+        XMemCopy((char*)t->data + tBeg * t->unitSize, t->devID,
+                 (char*)s->data + sBeg * s->unitSize, s->devID,
+                  s->unitSize * sLen);
+    }
+    else {
+        ShowNTErrors("TODO!");
+    }
+}
+
+/*
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it


--- a/source/tensor/core/movement/CopyValues.h
+++ b/source/tensor/core/movement/CopyValues.h
@@ -29,6 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy s to t */
 void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);

+/* copy a segment of s to t  */
+void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
+
 /* 
 copy s to t (return a XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -478,20 +478,39 @@ void KernelReduceSumFast(__half * input, __half * output,
 if data storage is discontinuius ,use this way to reduce 
 */
 __global__ 
-void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, 
+void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
                                         int strideNum, DTYPE * shift, DTYPE power, bool isExp)
 {
-    //int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    //int endIndex = (idx+1) * strideNum;
+    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int blockIndex = idx / stride;
-    int offsetInBlock = idx% stride;
+    int offsetInBlock = idx % stride; 
+    if (idx >= stride * blockNum)
+        return;
+    bias[idx % blockDim.x] = shift != NULL ? shift[idx] : 0;
    DTYPE ans = 0;
+
 #pragma unroll
    for (int i = stride * strideNum * blockIndex + offsetInBlock;
        i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum;
        i += stride){
-        ans += input[i];
+        DTYPE value = input[i];
+        value = value - bias[idx % blockDim.x];
+        if (power != (DTYPE)1.0) {
+            if (power == (DTYPE)2.0) {
+                value = value * value;
+            }
+            else if (power == (DTYPE)0.5) {
+                value = sqrt(value);
+            }
+            else {
+                value = pow(value, power);
+            }
+        }
+        if (isExp) {
+            value = exp(value);
+        }
+        ans += value;
    }
    output[idx] = ans;
 }
@@ -722,7 +741,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
        //convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
        dim3 grid, block;
        discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
-        KernelReduceSumDiscontinuousStorage <<<grid, block >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
+        KernelReduceSumDiscontinuousStorage <<<grid, block >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, blockNum,sp, power, isExp);
    }
    else {
        do {

--- a/source/tensor/core/shape/MakeMergeBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.cpp
@@ -33,14 +33,14 @@ set target data block index for the data movement in merge
 >> splitSizeInGrid - size of each data array to merge
 >> gridSize - number of blocks in a grid (here grid is a higher level orgnization upon blocks)
 >> gridNum - number of grids
->> mem - the memory pool
+>> devID - device id
 */
 void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
-                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
+                          int splitSizeInGrid, int gridSize, int gridNum, int devID)
 {
-    if (mem != NULL && mem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaMakeMergeBlockIndex(mem->devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
+        _CudaMakeMergeBlockIndex(devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
 #else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif

--- a/source/tensor/core/shape/MakeMergeBlockIndex.h
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.h
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* set target data block index for the data movement in merge */
 void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
-                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
+                          int splitSizeInGrid, int gridSize, int gridNum, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/shape/MakeSplitBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.cpp
@@ -31,13 +31,13 @@ set target data block index for the data movement in split
 >> splitNum - number of splits
 >> blockSplitSize - size of the splitted block
 >> blockNum - number of data blocks
->> mem - the memory pool
+>> devID - device id
 */
-void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem)
+void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID)
 {
-    if (mem != NULL && mem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaMakeSplitBlockIndex(mem->devID, blockIndex, splitNum, blockSplitSize, blockNum);
+        _CudaMakeSplitBlockIndex(devID, blockIndex, splitNum, blockSplitSize, blockNum);
 #else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif

--- a/source/tensor/core/shape/MakeSplitBlockIndex.h
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* set target data block index for the data movement in split */
-void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
+void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
@@ -42,10 +42,13 @@ e.g., (N/3, M, 3) -> (N, M)
 */
 void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
 {
-	int whereToMergeRDI = s->order - whereToMerge - 1;
-	int leadingDimRDI = s->order - leadingDim - 1;
+    if(leadingDim < 0)
+        leadingDim = 0;
+
+    int whereToMergeRDI = s->order - whereToMerge - 1;
+    int leadingDimRDI = s->order - leadingDim - 1;
    if (leadingDimRDI < 0)
-		leadingDimRDI = s->order - 1;
+        leadingDimRDI = s->order - 1;

    CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
    CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
@@ -60,8 +63,12 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
            CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
                          "Unmatched tensor sizes!");
        }
+        else if (i < leadingDimRDI){
+            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
+                          "Unmatched tensor sizes!");
+        }
        else if (i > leadingDimRDI) {
-            CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]),
+            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
                          "Unmatched tensor sizes!");
        }
    }
@@ -119,28 +126,24 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
        int realBlockSize = blockSize * t->unitSize;

        int * blockIndex = (int*)(mem != NULL ?
-            mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
-            XMemAlloc(mem->devID, blockNum * gridNum * sizeof(int)));
+                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));

-        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
+        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);

-        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
+        _CopyBlocksOnSite(s->data, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);

        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
        else
-            XMemFree(mem->devID, blockIndex);
-
-        /* copy from tmp to target */
-        XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
+            XMemFree(s->devID, blockIndex);

        if (!isOnSameDevice) {
            XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-
            if (mem != NULL)
                mem->ReleaseBuf(mem->devID, size);
            else
-                XMemFree(mem->devID, dataTMP);
+                XMemFree(s->devID, dataTMP);
        }
    }
 }
@@ -163,7 +166,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
    CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");
    
    if (leadingDim < 0)
-		leadingDim = 0;
+        leadingDim = 0;
    int order = s.order - 1;
    int * dimSize = new int[order];

@@ -205,7 +208,7 @@ merge small tensors into a big tensor
 */
 void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
 {
-	CheckNTErrors((smalls != NULL), "Invalid list!");
+    CheckNTErrors((smalls != NULL), "Invalid list!");
    CheckNTErrors((smalls->count > 0), "Empty list!");

    bool uniform = true;
@@ -233,7 +236,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
    int mergedNum = smalls->count;

    XTensor * s0 = (XTensor*)smalls->GetItem(0);
-	int whereToMergeRDI = s0->order - whereToMerge - 1;
+    int whereToMergeRDI = s0->order - whereToMerge - 1;
    for (int i = 0; i < s0->order; i++) {
        if (i <= whereToMergeRDI)
            blockSize *= s0->dimSizeRDI[i];
@@ -268,10 +271,10 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
    }
    /* merging with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
    else {
-        int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
-        for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
-            dimSizeTMP[i] = -smallsItem0->dimSizeRDI[i];
-        dimSizeTMP[smallsItem0->order] = -mergeNum;
+        int* dimSizeTMP = new int[smallsItem0->order + 1];
+        for (int i = 0; i < smallsItem0->order; i++)
+            dimSizeTMP[i + 1] = -smallsItem0->dimSize[i];
+        dimSizeTMP[0] = -mergeNum;

        XMem * mem = smallsItem0->mem;
        XTensor * tensorTMP = new XTensor(smallsItem0->order + 1, dimSizeTMP,
@@ -283,7 +286,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
        if (uniform)
            dataTMP = smallsItem0->data;
        else
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);

        tensorTMP->data = dataTMP;

@@ -295,18 +298,17 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
            }
        }

-        _Merge(tensorTMP, big, whereToMerge);
+        _Merge(tensorTMP, big, whereToMerge + 1);

        delete[] dimSizeTMP;
-        tensorTMP->data = NULL;
-        dataTMP = NULL;

+        tensorTMP->data = NULL;
        delete tensorTMP;

        if ((!uniform) && (mem != NULL))
            mem->ReleaseBuf(mem->devID, size);
        else
-            XMemFree(mem->devID, dataTMP);
+            XMemFree(big->devID, dataTMP);
    }
 }


--- a/source/tensor/core/shape/MergeBlockLists.cu
+++ b/source/tensor/core/shape/MergeBlockLists.cu
@@ -109,6 +109,9 @@ void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockN
    CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
    realMaxBlockSize = maxBlockSize / sizeof(DTYPE);

+    int devIDBackup;
+    ProtectCudaDev(myMem->devID, devIDBackup);
+
    int cudaGridSizes[3];
    int cudaBlockSizes[3];

@@ -135,6 +138,8 @@ void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockN
    delete[] targetArrays;
    delete[] sizes;
    delete[] offsets;
+
+    BacktoCudaDev(myMem->devID, devIDBackup);
 }
 #endif // USE_CUDA


--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
@@ -24,6 +24,7 @@
 #include "MakeSplitBlockIndex.h"
 #include "../../XName.h"
 #include "../../XTensor.h"
+#include "../../XDevice.h"
 #include "../../XUtility.h"
 #include "../movement/CopyBlocksOnSite.h"

@@ -88,10 +89,33 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        int n = blockNum / splitNum;
        int sStep = blockSize * s->unitSize;
        int tStep = n * tPitch;
-        for (int k = 0; k < splitNum; k++) {
-            XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
-                (char*)s->data + k * sStep, sPitch, s->devID,
-                mSize, n);
+        if(t->devID < 0){
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)s->data + k * sStep, sPitch, s->devID,
+                            mSize, n);
+            }
+        }
+        else{
+#ifdef USE_CUDA
+#ifdef STREAMED_MEMCPOPY
+            XStream * stream = GDevs.GPUs[t->devID].stream;
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
+                                (char*)s->data + k * sStep, sPitch, s->devID,
+                                 mSize, n, stream);
+            }
+            stream->StreamSynchronize();
+#else
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)s->data + k * sStep, sPitch, s->devID,
+                            mSize, n);
+            }
+#endif
+#else
+            ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
        }
    }
    else {
@@ -108,17 +132,17 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        int blockSplitSize = blockNum / splitNum;

        int * blockIndex = (int*)(mem != NULL ?
-            mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
-            XMemAlloc(mem->devID, blockNum * sizeof(int)));
+                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * sizeof(int)));

-        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, mem);
+        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);

-        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
+        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);

        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
        else
-            XMemFree(mem->devID, blockIndex);
+            XMemFree(s->devID, blockIndex);

        /* copy from tmp to target */
        if (!isOnSameDevice) {
@@ -127,7 +151,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
            if (mem != NULL)
                mem->ReleaseBuf(mem->devID, size);
            else
-                XMemFree(mem->devID, dataTMP);
+                XMemFree(s->devID, dataTMP);
        }
    }
 }
@@ -144,6 +168,8 @@ make a new tensor to keep the result and return it
 XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
 {
    CheckNTErrors(&s, "Invalid tensors!");
+    CheckNTErrors(s.dimSize[whereToSplit] % splitNum == 0, 
+                  "The dimension cannot be splitted due to the inproper split number");

    int order = s.order + 1;
    int * dimSize = new int[order];
@@ -226,20 +252,46 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
        int n = blockNum / splitNum;
        int sStep = blockSize * big->unitSize;
        int tStep = 0;
-        for (int k = 0; k < splitNum; k++) {
-            XTensor * t = (XTensor*)smalls->GetItem(k);
-            XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
-                (char*)big->data + k * sStep, sPitch, big->devID,
-                mSize, n);
+
+        if(big->devID < 0){
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)big->data + k * sStep, sPitch, big->devID,
+                            mSize, n);
+            }
+        }
+        else{
+#ifdef USE_CUDA
+#ifdef STREAMED_MEMCPOPY
+            XStream * stream = GDevs.GPUs[big->devID].stream;
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
+                                (char*)big->data + k * sStep, sPitch, big->devID,
+                                 mSize, n, stream);
+            }
+            stream->StreamSynchronize();
+#else
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)big->data + k * sStep, sPitch, big->devID,
+                            mSize, n);
+            }
+#endif
+#else
+            ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
        }
    }
    /* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
    else {
-        int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
-        for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
-            dimSizeTMP[i] = -big->dimSize[i];
-        dimSizeTMP[whereToSplit] /= splitNum;
-        dimSizeTMP[big->order] = -splitNum;
+        int* dimSizeTMP = new int[big->order + 1];
+        for (int i = 0; i < big->order; i++)
+            dimSizeTMP[i + 1] = -big->dimSize[i];
+        dimSizeTMP[whereToSplit + 1] /= splitNum;
+        dimSizeTMP[0] = -splitNum;

        XMem * mem = big->mem;
        XTensor* tensorTMP = new XTensor(big->order + 1, dimSizeTMP, big->dataType, big->denseRatio, big->devID, mem);
@@ -251,7 +303,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
            dataTMP = first->data;
        }
        else {
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
        }

        tensorTMP->data = dataTMP;
@@ -270,13 +322,12 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
        delete[] dimSizeTMP;

        tensorTMP->data = NULL;
-        dataTMP = NULL;
        delete tensorTMP;

        if ((!uniform) && (mem != NULL))
            mem->ReleaseBuf(mem->devID, size);
        else
-            XMemFree(mem->devID, dataTMP);
+            XMemFree(big->devID, dataTMP);
    }
 }


--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
@@ -26,6 +26,8 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

+#define STREAMED_MEMCPOPY
+
 /* 
 transform a tensor by splitting it 
 e.g., (M, N) -> (M, N/3, 3) 

--- a/source/tensor/core/shape/Transpose.cpp
+++ b/source/tensor/core/shape/Transpose.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */

+/*
+ * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
+ * It is extreamly hot these days and i cannot sleep well. Fortunately we had 
+ * good lunch of Steamed Cold Noodles. This made me feel much better!
+ */
+
+#include "Transpose.h"
+#include "Merge.h"
+#include "../../XUtility.h"
+#include "../../XName.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+tensor transposition of dimensions i and j
+b = transposed(a) 
+
+For a input tensor a, we tranpose the dimensions i and j of it.
+E.g., let a be a tensor of size x * y * z, i = 0, j = 2, 
+then the output will be a tensor of size z * y * x.
+
+>> a - the input tensor
+>> b - the output tensor by transpose tensor a with specified dimensions i and j
+>> i - the transposed dimension
+>> j - the transposed dimension
+*/
+void _Transpose(const XTensor * a, XTensor * b, const int i, const int j)
+{
+    CheckNTErrors(a && b, "Empty tensors");
+    CheckNTErrors(a->order == b->order, "Wrong tensor orders");
+    CheckNTErrors(a->unitNum == b->unitNum && a->unitSize == b->unitSize, "Wrong tensor sizes");
+    CheckNTErrors(a->order > i && i >= 0, "index of dimension is out of scope!");
+    CheckNTErrors(a->order > j && j >= 0, "index of dimension is out of scope!");
+
+    for(int k = 0; k < a->order; k++){
+        if(k == i){
+            CheckNTErrors(a->dimSize[k] == b->dimSize[j], "Wrong dimension size in transposition");
+        }
+        else if(k == j){
+            CheckNTErrors(a->dimSize[k] == b->dimSize[i], "Wrong dimension size in transposition");
+        }
+        else{
+            CheckNTErrors(a->dimSize[k] == b->dimSize[k], "Wrong dimension size in transposition");
+        }
+    }
+
+    if(i == j){
+        XMemCopy(b->data, b->devID, a->data, a->devID, b->unitNum * b->unitSize);
+    }
+    else{
+        int I = MIN(i, j);
+        int J = MAX(i, j);
+        int * dims = new int[a->order + 1];
+
+        for(int k = 0; k <= J; k++)
+            dims[k] = a->dimSize[k];
+        dims[J + 1] = -1;
+        for(int k = J + 1; k < a->order; k++)
+            dims[k + 1] = a->dimSize[k];
+
+        /* reshape tensor a form (..., n_I, ..., n_J, ...) => (..., n_I, ..., n_J, 1, ...)*/
+        XTensor * aTMP =  new XTensor(a->order + 1, dims, a->dataType, a->denseRatio, a->devID, a->mem);
+        aTMP->data = a->data;
+
+        for(int k = 0; k < I; k++)
+            dims[k] = a->dimSize[k];
+        for(int k = I + 1; k <= J; k++)
+            dims[k - 1] = a->dimSize[k];
+        dims[J] = a->dimSize[I];
+        for(int k = J + 1; k < a->order; k++)
+            dims[k] = a->dimSize[k];
+
+        /* reshape tensor b form (..., m_I, ..., m_J, ...) => (..., m_J, m_I, ...) */
+        b->Reshape(b->order, dims);
+
+        /* tensor (..., n_I, ..., n_J, 1, ...) => tensor (..., m_J, m_I, ...) */
+        _Merge(aTMP, b, J + 1, I);
+
+        memcpy(dims, a->dimSize, sizeof(int) * a->order);
+        dims[I] = a->dimSize[J];
+        dims[J] = a->dimSize[I];
+
+        /* reshape tensor b form (..., m_J, m_I, ...) => (..., m_J, ..., m_I, ...) =>  */
+        b->Reshape(b->order, dims);
+
+        aTMP->data = NULL;
+        delete[] dims;
+        delete aTMP;
+    }
+}
+
+/*
+tensor transposition of dimensions i and j (return a XTensor structure).
+make a new tensor to keep the result and return it.
+b = transposed(a)
+
+For a input tensor a, we tranpose the dimensions i and j of it.
+E.g., let a be a tensor of size x * y * z, i = 0, j = 2, 
+then the output will be a tensor of size z * y * x.
+
+>> a - the input tensor
+>> i - the transposed dimension
+>> j - the transposed dimension
+<< return - the output tensor by transpose tensor a with specified dimensions i and j
+*/
+XTensor Transpose(const XTensor &a, const int i, const int j)
+{
+    CheckNTErrors(a.order > i && i >= 0, "index of dimension is out of scope!");
+    CheckNTErrors(a.order > j && j >= 0, "index of dimension is out of scope!");
+
+    int order = a.order;
+    int * dimSize = new int[order];
+    for(int k = 0; k < order; k++){
+        if(k == i)
+            dimSize[k] = a.dimSize[j];
+        else if(k == j)
+            dimSize[k] = a.dimSize[i];
+        else
+            dimSize[k] = a.dimSize[k];
+    }
+
+    float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
+    XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
+    b.SetTMP();
+
+    /* call _Transpose function */
+    _Transpose(&a, &b, i, j);
+    
+    /* tensor connection */
+    XLink::MakeLink(&a, NULL, &b, SHAPE_TRANSPOSE);
+    XLink::AddParamToHeadInt(&b, i);
+    XLink::AddParamToHeadInt(&b, j);
+
+    /* destroy variables */
+    delete[] dimSize;
+
+    return b;
+}
+
+}
--- a/source/tensor/core/shape/Transpose.h
+++ b/source/tensor/core/shape/Transpose.h
@@ -27,27 +27,18 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-#define transpose _Transpose_
-
 /*
-generate a transposed 1D/2D tensor
+tensor transposition of dimensions i and j
 b = transposed(a) 
 */
-void _Transpose(XTensor * a, XTensor * b);
-
-/* 
-transpose a 1D/2D tensor (do it on site).
-keep the result in the input tensor and return nothing.
-a = transposed(a) 
-*/
-void _TransposeMe(XTensor * a);
+void _Transpose(const XTensor * a, XTensor * b, const int i, const int j);

 /* 
-make a transposed 1D/2D tensor (return a XTensor structure).
+tensor transposition of dimensions i and j (return a XTensor structure).
 make a new tensor to keep the result and return it.
 b = transposed(a)
 */
-XTensor Transpose(XTensor &a);
+XTensor Transpose(const XTensor &a, const int i, const int j);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/shape/Unsqueeze.cu
+++ b/source/tensor/core/shape/Unsqueeze.cu
@@ -32,12 +32,108 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
 >> s - pointer to the source data array
 >> blockSize - size of a block
+
+>> totalSize - total size of the blocks (i.e., blockSIze * n)
+>> t - pointer to the target data array
+>> n - number of blocks to copy data
+*/
+template<class T>
+__global__
+void KernelUnsqueezeFlat(void * s, int blockSize, int totalSize, void * t, int n)
+{
+    /* index of data items */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i >= blockSize)
+        return;
+
+    T value = ((T*)s)[i];
+    T * tData = (T*)t;
+
+    __syncthreads();
+
+    for (int k = i; k < totalSize; k += blockSize)
+        tData[k] = value;
+}
+
+/*
+insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
+>> s - pointer to the source data array
+>> blockSize - size of a block
+
+>> totalSize - total size of the blocks (i.e., blockSIze * n)
+>> t - pointer to the target data array
+>> n - number of blocks to copy data
+*/
+template<class T>
+__global__
+void KernelUnsqueezeFlatBigram(void * s, int blockSize, int totalSize, void * t, int n)
+{
+    /* index of data items */
+    int i = (blockDim.x * blockIdx.x + threadIdx.x) * 2;
+
+    if (i >= blockSize)
+        return;
+
+    T value = ((T*)s)[i];
+    T value2 = ((T*)s)[i + 1];
+    T * tData = (T*)t;
+
+    __syncthreads();
+
+    for (int k = i; k < totalSize; k += blockSize){
+        tData[k] = value;
+        tData[k + 1] = value2;
+    }
+}
+
+/*
+insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
+>> s - pointer to the source data array
+>> blockSize - size of a block
+
+>> totalSize - total size of the blocks (i.e., blockSIze * n)
+>> t - pointer to the target data array
+>> n - number of blocks to copy data
+*/
+template<class T>
+__global__
+void KernelUnsqueezeFlat2D(void * s, int blockSize, int totalSize, void * t, int n)
+{
+    __shared__ T data[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ int offsets[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+
+    /* index of data items */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    /* index of data items */
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (i >= blockSize || j >= n)
+        return;
+
+    if(threadIdx.y == 0)
+        data[threadIdx.x] = ((T*)s)[i];
+    if(threadIdx.x == 0)
+        offsets[threadIdx.y] = blockSize * j;
+
+    __syncthreads();
+
+    ((T*)t)[offsets[threadIdx.y] + i] = data[threadIdx.x];
+}
+
+/*
+insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
+>> s - pointer to the source data array
+>> blockSize - size of a block
 >> blockNum - number of the blocks
+>> totalSize - total size of the blocks (i.e., blockSIze * n)
 >> t - pointer to the target data array
+>> n - number of blocks to copy data
 */
 template<class T>
 __global__
-void KernelUnsqueeze(void * s, int blockSize, int blockNum, void * t, int n)
+void KernelUnsqueeze(void * s, int blockSize, int blockNum, int totalSize, void * t, int n)
 {
    /* index of data items */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -51,11 +147,10 @@ void KernelUnsqueeze(void * s, int blockSize, int blockNum, void * t, int n)
    MTYPE offset = blockSize * j;
    T value = ((T*)s)[offset + i];
    T * tData = (T*)t + offset * n;
-    int length = blockSize * n;

    __syncthreads();

-    for (int k = i; k < length; k += blockSize)
+    for (int k = i; k < totalSize; k += blockSize)
        tData[k] = value;
 }

@@ -83,21 +178,71 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
    int cudaGrids[3];
    int cudaBlocks[3];

-    GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
-
    int devIDBackup = 0;
    ProtectCudaDev(a->devID, devIDBackup);

-    if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
-        KernelUnsqueeze<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            (a->data, blockSize, blockNumA, b->data, dSize);
+    if(blockNumA > 1){
+        GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
+
+        if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
+            KernelUnsqueeze<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                      (a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
+        }
+        else if (a->dataType == X_INT && a->dataType == X_INT) {
+            KernelUnsqueeze<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                    (a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
+    else if(blockNumA == 1 && blockSize < MAX_CUDA_THREAD_NUM_PER_BLOCK){
+        GDevs.GetCudaThread2D(a->devID, blockSize, dSize, MAX_CUDA_THREAD_NUM_PER_BLOCK/4, cudaGrids, cudaBlocks);
+
+        if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
+            KernelUnsqueezeFlat2D<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                          (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else if (a->dataType == X_INT && a->dataType == X_INT) {
+            KernelUnsqueezeFlat2D<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                                        (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
+    }
+    else if(blockNumA == 1 && blockSize % 2 == 0){
+        GDevs.GetCudaThread(a->devID, blockSize/2, cudaGrids, cudaBlocks);
+
+        if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
+            KernelUnsqueezeFlatBigram<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
+                                                (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else if (a->dataType == X_INT && a->dataType == X_INT) {
+            KernelUnsqueezeFlatBigram<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
+                                              (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
    }
-    else if (a->dataType == X_INT && a->dataType == X_INT) {
-        KernelUnsqueeze<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-            (a->data, blockSize, blockNumA, b->data, dSize);
+    else if(blockNumA == 1){
+        GDevs.GetCudaThread(a->devID, blockSize, cudaGrids, cudaBlocks);
+
+        if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
+            KernelUnsqueezeFlat<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
+                                          (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else if (a->dataType == X_INT && a->dataType == X_INT) {
+            KernelUnsqueezeFlat<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
+                                        (a->data, blockSize, blockSize * dSize, b->data, dSize);
+        }
+        else {
+            ShowNTErrors("TODO!");
+        }
    }
-    else {
-        ShowNTErrors("TODO!");
+    else{
+        ShowNTErrors("Something is wrong!");
    }

    BacktoCudaDev(a->devID, devIDBackup);

--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
@@ -859,7 +859,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
        //XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->devID, a->mem);
        //indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));

-        ///* make the index tensor */
+        /* make the index tensor */
        //indexA->SetAscendingOrder(dim);

        //_CudaSortBig(a, b, indexA, index, dim, k);

--- a/source/tensor/core/utilities/FlushToMem.cu
+++ b/source/tensor/core/utilities/FlushToMem.cu
@@ -117,7 +117,7 @@ void CudaGPUToCPUFlush(XTensor * tensor)
    else {
        tensor->dataHost = new char[tensor->unitNum * tensor->unitSize];
        if (tensor->data != NULL)
-            cudaMemcpy(tensor->dataHost, tensor->data, tensor->unitNum * tensor->unitSize, cudaMemcpyDeviceToHost);
+            XMemCopy(tensor->dataHost, -1, tensor->data, tensor->devID, tensor->unitNum * tensor->unitSize);
        else
            memset(tensor->dataHost, 0, tensor->unitNum * tensor->unitSize);
    }

--- a/source/tensor/function/HardTanH.cpp
+++ b/source/tensor/function/HardTanH.cpp
@@ -116,8 +116,7 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
    }
 #endif

-    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
-    {
+    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
        /* calculate dE/dy */
        if(lossName != NOLOSS)
            _LossBackward(dedy, gold, y, lossName);

--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -38,6 +38,17 @@ log scale softmax y = log(e^x / \sum_{i} e^{x_i})
 */
 void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
 {
+    CheckNTErrors(!x->isSparse && !y->isSparse, "TODO!");
+    CheckNTErrors(x && y, "Empty input tensors!");
+
+    if(leadDim < 0)
+        leadDim = x->order - 1;
+
+    if(y->dimSize[leadDim] == 1){
+        y->SetZeroAll();
+        return;
+    }
+
    int leadDimRDI = x->order - leadDim - 1;
    if (!x->isSparse && !y->isSparse &&
        x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
@@ -68,25 +79,27 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
        blockSize = stride * dimensionSize;
        blockNum = y->unitNum / blockSize;

-        max = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
-        sum = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
-
-        max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
-        sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
+        max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
+        sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);

        _ReduceMax(x, max, leadDim);
        _ReduceSum(x, sum, leadDim, max, 1.0F, true);

        if (x->devID >= 0) {
-            int dims[2];
-            dims[0] = -stride;
-            dims[1] = dimensionSize;
-            blockx = NewTensor(2, dims, x->dataType, x->denseRatio, x->devID, mem);
-            blocky = NewTensor(2, dims, x->dataType, x->denseRatio, x->devID, mem);
-            dims[0] = -stride;
-            dims[1] = 1;
-            blockMax = NewTensor(2, dims, x->dataType, x->denseRatio, x->devID, mem);
-            blockSum = NewTensor(2, dims, x->dataType, x->denseRatio, x->devID, mem);
+            if(leadDimRDI == 0){
+                blockSize = y->unitNum;
+                blockNum  = 1;
+                blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
+                blocky = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
+                blockMax = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
+                blockSum = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
+            }
+            else{
+                blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
+                blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
+                blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
+                blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
+            }
        }

        for (int k = 0; k < blockNum; k++) {
@@ -123,7 +136,10 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
                blockMax->data = mp;
                blockSum->data = sp;
 #ifdef USE_CUDA
-                _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
+                if(leadDimRDI == 0)
+                    _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
+                else
+                    _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
 #else
                ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
@@ -134,21 +150,10 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
            }
        }

-        if (x->devID < 0) {
-            if (mem != NULL) {
-                mem->ReleaseBuf(mem->devID, max->unitNum * max->unitSize);
-                mem->ReleaseBuf(mem->devID, sum->unitNum * sum->unitSize);
-            }
-            else {
-                XMemFree(max->devID, max->data);
-                XMemFree(sum->devID, sum->data);
-                max->data = NULL;
-                sum->data = NULL;
-            }
-            delete max;
-            delete sum;
-        }
-        else {
+        DelTensorBuf(max);
+        DelTensorBuf(sum);
+
+        if (x->devID >= 0) {
            delete blockx;
            delete blocky;
            delete blockMax;
@@ -171,19 +176,44 @@ make a new tensor to keep the result and return it
 */
 XTensor LogSoftmax(const XTensor &x, int leadDim)
 {
+    int ld = leadDim;
+    if (ld < 0)
+        ld = x.order - 1;
+
    XTensor y(&x);
    y.SetTMP();

    /* call _LogSoftmax function */
-    _LogSoftmax(&x, &y, leadDim);
+    _LogSoftmax(&x, &y, ld);

    /* tensor connection */
    XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
-    XLink::AddParamToHeadInt(&y, leadDim);
+    XLink::AddParamToHeadInt(&y, ld);

    return y;
 }

+/* 
+log scale softmax y = log(e^x / \sum_{i} e^{x_i})
+make a new tensor to keep the result and return it
+
+>> x - input vector
+>> y - output vector
+>> leadDim - leading dimension (along which we perform reduction)
+*/
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
+{
+    if(!XTensor::IsSameShaped(&x, &y))
+        InitTensor(&y, &x);
+
+    /* call _LogSoftmax function */
+    _LogSoftmax(&x, &y, leadDim);
+
+    /* tensor connection */
+    XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
+    XLink::AddParamToHeadInt(&y, leadDim);
+}
+
 /*
 backward computation for dense matrices with default data type

@@ -255,6 +285,9 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    CheckNTErrors((!dedx->isSparse), "The gradient matrix must be dense!");
    CheckNTErrors((gold != NULL), "The gold standard cannot be empty!");

+    if(leadDim < 0)
+        leadDim = y->order - 1;
+
    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
    if (gold->devID >= 0) {

--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
@@ -33,6 +33,9 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
 /* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure) */
 XTensor LogSoftmax(const XTensor &x, int leadDim);

+/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
+void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
+
 /* de/dx */
 void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, 
                         XTensor * dedy, XTensor * dedx,

--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
@@ -24,7 +24,7 @@
 #include "../XDevice.h"
 #include "../core/math/Power.h"
 #include "../core/math/ScaleAndShift.h"
-#include "../core/math/Log.h"
+#include "../core/math/Unary.h"
 #include "../core/arithmetic/Negate.h"
 #include "../core/arithmetic/Sum.h"
 #include "../core/arithmetic/Multiply.h"

--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
@@ -37,6 +37,9 @@ softmax y = e^x / \sum_{i} e^{x_i}
 */
 void _Softmax(const XTensor * x, XTensor * y, int leadDim)
 {
+    if(leadDim < 0)
+        leadDim = x->order - 1;
+
    int leadDimRDI = x->order - leadDim - 1;
    if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
        int * dimSize = new int[x->order - 1];
@@ -100,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
                    else{
                        for(int i = 0; i < n; i++){
                            DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
-                            if(IsNAN(r))
-                                r = DTYPE_MIN;
-                            if(IsINF(r))
-                                r = DTYPE_MIN;
+                            if (r > (DTYPE)1.0F)
+                                r = (DTYPE)1.0F;
+                            else if (r < 0)
+                                r = 0;
                            op[i * m + j] = r;
                        }
                    }
@@ -140,14 +143,19 @@ make a new tensor to keep the result and return it
 */
 XTensor Softmax(const XTensor &x, int leadDim)
 {
+    int ld = leadDim;
+    if (ld < 0)
+        ld = x.order - 1;
+
    XTensor y(&x);
    y.SetTMP();

    /* call _Softmax function */
-    _Softmax(&x, &y, leadDim);
+    _Softmax(&x, &y, ld);

    /* tensor connection */
    XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
+    XLink::AddParamToHeadInt(&y, ld);

    return y;
 }
@@ -182,10 +190,14 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                      int leadDim,
                      LOSS_FUNCTION_NAME lossName)
 {
-    CheckNTErrors((dedx->isSparse == false), "The gradient tensor must be dense!");
-    CheckNTErrors((gold != NULL), "Incorrect x gold standard tensor!");
+    CheckNTErrors(dedx->isSparse == false, "The gradient tensor must be dense!");
+    CheckNTErrors(gold != NULL || lossName == NOLOSS, "Gold standard is required for computing loss!");
+
+    if(leadDim < 0)
+        leadDim = y->order - 1;

    int leadDimRDI = y->order - leadDim - 1;
+
 #ifdef USE_CUDA
    if(y->devID >= 0){
        _CudaSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y, 

    if(i < strideSizeTotal && j < strideNum){
        int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
-        y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        if (r >(DTYPE)1.0F)
+            r = (DTYPE)1.0F;
+        else if (r < 0)
+            r = 0;
+        y[offset] = r;
+
    }
 }

@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE * 
        maxData = broadcast(maxData);
        if (i < strideNum){
            int offset = int(j / stride) * blockSize + i * stride + i2;
-            output[offset] = exp(input[offset] - maxData) / sumData;
+            DTYPE r = exp(input[offset] - maxData) / sumData;
+            if (r > (DTYPE)1.0F)
+                r = (DTYPE)1.0F;
+            else if (r < 0)
+                r = 0;
+            output[offset] = r;
        }
    }
 }
@@ -305,6 +316,9 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    CheckNTErrors((x->devID == y->devID), "Matrices used in log softmax are not on the same GPU.");
    CheckNTErrors((y->order >= 1), "Empty tensor!");

+    int devIDBackup;
+    ProtectCudaDev(x->devID, devIDBackup);
+
    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
        
        CheckNTErrors((lossName == CROSSENTROPY || 
@@ -350,8 +364,14 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
            /* make a matrix to keep \beta */
            XTensor * beta = new XTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, mem);

-            ytmp->data = mem->AllocBuf(mem->devID, y->unitNum * y->unitSize);
-            beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
+            if(mem != NULL){
+                ytmp->data = mem->AllocBuf(mem->devID, y->unitNum * y->unitSize);
+                beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
+            }
+            else{
+                ytmp->data = XMemAlloc(y->devID, y->unitNum * y->unitSize);
+                beta->data = XMemAlloc(y->devID, beta->unitNum * beta->unitSize);
+            }

            /* \beta = \sum_i (dE/dy_i * y_i) */
            _Multiply(dedy, y, ytmp, 0, 0);
@@ -364,8 +384,18 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
            /* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
            _Multiply(y, ytmp, dedx, 0, 0);

-            mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
-            mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
+
+            if(mem != NULL){
+                mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
+                mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
+            }
+            else{
+                XMemFree(y->devID, ytmp->data);
+                XMemFree(y->devID, beta->data);
+            }
+
+            ytmp->data = NULL;
+            beta->data = NULL;

            delete[] dimSize;
            delete ytmp;
@@ -377,6 +407,8 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
    }
    else
        ShowNTErrors("TODO!");
+
+    BacktoCudaDev(x->devID, devIDBackup);
 }

 #endif

--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
@@ -19,6 +19,7 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
 */

+#include "../core/math/Unary.h"
 #include "TAbsolute.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -30,14 +31,14 @@ Set every entry to its absolute value.
 bool TestAbsolute1()
 {
 	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;

-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];

 	DTYPE aData[3][2] = { {1.0F, -2.0F}, 
 	                      {0.5F, -4.0F},
@@ -50,14 +51,14 @@ bool TestAbsolute1()
 	bool cpuTest = true;

 	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize);
-	XTensor * b = NewTensor(aOrder, aDimSize);
-	XTensor * aMe = NewTensor(aOrder, aDimSize);
+	XTensor * a = NewTensor(order, dimSize);
+	XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
    XTensor bUser;

 	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-    aMe->SetData(aData, aUnitNum);
+	a->SetData(aData, unitNum);
+    aMe->SetData(aData, unitNum);

 	/* call Absolute function */
    _Absolute(a, b);
@@ -65,21 +66,21 @@ bool TestAbsolute1()
    bUser = Absolute(*a);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
 	bool gpuTest = true;

 	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

 	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
-    aMeGPU->SetData(aData, aUnitNum);
+	aGPU->SetData(aData, unitNum);
+    aMeGPU->SetData(aData, unitNum);

 	/* call Absolute function */
    _Absolute(aGPU, bGPU);
@@ -87,7 +88,7 @@ bool TestAbsolute1()
    bUserGPU = Absolute(*aGPU);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);

 	/* destroy variables */
 	delete a;
@@ -96,7 +97,7 @@ bool TestAbsolute1()
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
-	delete[] aDimSize;
+	delete[] dimSize;

 	return cpuTest && gpuTest;
 #else
@@ -104,7 +105,7 @@ bool TestAbsolute1()
 	delete a;
 	delete b;
 	delete aMe;
-	delete[] aDimSize;
+	delete[] dimSize;

 	return cpuTest;
 #endif // USE_CUDA

--- a/source/tensor/test/TAbsolute.h
+++ b/source/tensor/test/TAbsolute.h
@@ -22,7 +22,6 @@
 #ifndef __TEST_ABSOLUTE_H__
 #define __TEST_ABSOLUTE_H__

-#include "../core/arithmetic/Absolute.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/test/TClip.cpp
+++ b/source/tensor/test/TClip.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
+*/
+
+#include "../XTensor.h"
+#include "TClip.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Clip function.
+Set every entry to its clip value.
+*/
+bool TestClip1()
+{
+	/* a tensor of size (3, 2) */
+	int aOrder = 2;
+	int * aDimSize = new int[aOrder];
+	aDimSize[0] = 3;
+	aDimSize[1] = 2;
+
+	int aUnitNum = 1;
+	for (int i = 0; i < aOrder; i++)
+		aUnitNum *= aDimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, -2.0F},
+						  {0.0F, 4.0F},
+						  {5.0F, -6.0F} };
+	DTYPE answer[3][2] = { {1.0F, -1.0F},
+						   {0.0F, 1.0F},
+					   	   {1.0F, -1.0F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(aOrder, aDimSize);
+	XTensor * b = NewTensor(aOrder, aDimSize);
+	XTensor * aMe = NewTensor(aOrder, aDimSize);
+	XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, aUnitNum);
+	aMe->SetData(aData, aUnitNum);
+
+	/* call Clip function */
+	_Clip(a, b, -1.0, 1.0);
+	_ClipMe(aMe, -1.0, 1.0);
+	bUser = Clip(*a, -1.0, 1.0);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMe->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUser.CheckData(answer, aUnitNum, 1e-4F);
+
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, aUnitNum);
+	aMeGPU->SetData(aData, aUnitNum);
+
+	/* call Clip function */
+	_Clip(aGPU, bGPU, -1.0, 1.0);
+	_ClipMe(aMeGPU, -1.0, 1.0);
+	bUserGPU = Clip(*aGPU, -1.0, 1.0);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete aGPU;
+	delete bGPU;
+	delete aMeGPU;
+	delete[] aDimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] aDimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Clip Function */
+bool TestClip()
+{
+	XPRINT(0, stdout, "[TEST Clip] set every entry to its clip value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestClip1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TMatrixMULBatchedCPU.h
+++ b/source/tensor/test/TMatrixMULBatchedCPU.h
@@ -16,19 +16,19 @@
 */

 /*
-* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
+* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
 */

-#ifndef __TEST_MATRIXMULBATCHEDCPU_H__
-#define __TEST_MATRIXMULBATCHEDCPU_H__
+#ifndef __TEST_CLIP_H__
+#define __TEST_CLIP_H__

-#include "../core/arithmetic/MatrixMULBatchedCPU.h"
+#include "../core/math/Clip.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* test for MatrixMulBatchedCPU Function */
+/* test for Clip Function */
 extern "C"
-bool TestMatrixMulBatchedCPU();
+bool TestClip();

 } // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_MATRIXMULBATCHEDCPU_H__
+#endif // __TEST_CLIP_H__
--- a/source/tensor/test/TCos.cpp
+++ b/source/tensor/test/TCos.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#include "../core/math/Unary.h"
+#include "TCos.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Cos function.
+Set every entry to its cosine value.
+*/
+bool TestCos1()
+{
+	/* a tensor of size (3, 2) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;
+
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+	                      {-1.0F, -2.0F},
+	                      {0.0F, 0.5F} };
+	DTYPE answer[3][2] = { {0.5403F, -0.4161F},
+	                       {0.5403F, -0.4161F},
+	                       {1.0F, 0.8776F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);
+
+	/* call Cos function */
+	_Cos(a, b);
+	_CosMe(aMe);
+    bUser = Cos(*a);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+    
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);
+
+	/* call Cos function */
+    _Cos(aGPU, bGPU);
+	_CosMe(aMeGPU);
+    bUserGPU = Cos(*aGPU);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+	delete[] dimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] dimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Cos Function */
+bool TestCos()
+{
+	XPRINT(0, stdout, "[TEST Cos] set every entry to its cosine value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestCos1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TCos.h
+++ b/source/tensor/test/TCos.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#ifndef __TEST_SIN_H__
+#define __TEST_SIN_H__
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Sin Function */
+bool TestSin();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SIN_H__
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#ifndef __TEST_COS_H__
+#define __TEST_COS_H__
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Cos Function */
+bool TestCos();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_COS_H__
--- a/source/tensor/test/TDiv.cpp
+++ b/source/tensor/test/TDiv.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#include "TDiv.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* 
+case 1: element-wise division of two tensors
+c(i) = a(i)/b(i) + \alpha * c(i)
+In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
+*/
+bool TestDiv1()
+{
+	/* a source tensor of size (2, 2) */
+	int sOrder1 = 2;
+	int * sDimSize1 = new int[sOrder1];
+	sDimSize1[0] = 2;
+	sDimSize1[1] = 2;
+
+	int sUnitNum1 = 1;
+	for (int i = 0; i < sOrder1; i++)
+		sUnitNum1 *= sDimSize1[i];
+
+	/* a source tensor of size (2, 2) */
+	int sOrder2 = 2;
+	int * sDimSize2 = new int[sOrder2];
+	sDimSize2[0] = 2;
+	sDimSize2[1] = 2;
+
+	int sUnitNum2 = 1;
+	for (int i = 0; i < sOrder2; i++)
+		sUnitNum2 *= sDimSize2[i];
+
+	/* a target tensor of size (2, 2) */
+	int tOrder = 2;
+	int * tDimSize = new int[tOrder];
+	tDimSize[0] = 2;
+	tDimSize[1] = 2;
+
+	int tUnitNum = 1;
+	for (int i = 0; i < tOrder; i++)
+		tUnitNum *= tDimSize[i];
+
+	DTYPE sData1[2][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F} };
+	DTYPE sData2[2][2] = { {1.0F, 1.0F},
+	                       {4.0F, 9.0F} };
+	DTYPE answer[2][2] = { {0.0F, 1.0F},
+	                       {0.5F, 0.3333F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+	XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+	XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * tMe = NewTensor(tOrder, tDimSize);
+    XTensor tUser;
+
+	/* initialize variables */
+	s1->SetData(sData1, sUnitNum1);
+	tMe->SetData(sData1, sUnitNum1);
+	s2->SetData(sData2, sUnitNum2);
+	t->SetZeroAll();
+
+	/* call Div function */
+	_Div(s1, s2, t, 0, 0);
+	_DivMe(tMe, s2, 0, 0);
+    tUser = Div(*s1, *s2, 0);
+
+	/* check results */
+	cpuTest = t->CheckData(answer, tUnitNum, 1e-4F) && 
+              tMe->CheckData(answer, tUnitNum, 1e-4F) && 
+              tUser.CheckData(answer, tUnitNum, 1e-4F);
+
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor tUserGPU;
+
+	/* Initialize variables */
+	sGPU1->SetData(sData1, sUnitNum1);
+	tMeGPU->SetData(sData1, sUnitNum1);
+	sGPU2->SetData(sData2, sUnitNum2);
+	tGPU->SetZeroAll();
+
+	/* call Div function */
+	_Div(sGPU1, sGPU2, tGPU, 0, 0);
+	_DivMe(tMeGPU, sGPU2, 0, 0);
+    tUserGPU = Div(*sGPU1, *sGPU2, 0);
+
+	/* check results */
+	gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F) && 
+              tMeGPU->CheckData(answer, tUnitNum, 1e-4F) && 
+              tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
+
+	/* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete tMe;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete tMeGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+
+	return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete tMe;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Div Function */
+bool TestDiv()
+{
+	XPRINT(0, stdout, "[TEST Div] element-wise division of two tensors \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestDiv1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TDiv.h
+++ b/source/tensor/test/TDiv.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#ifndef __TEST_DIV_H__
+#define __TEST_DIV_H__
+
+#include "../core/arithmetic/Div.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Div Function */
+extern "C"
+bool TestDiv();
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __TEST_DIV_H__
--- a/source/tensor/test/TExp.cpp
+++ b/source/tensor/test/TExp.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#include "../core/math/Unary.h"
+#include "TExp.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Exp function.
+Set every entry to its exponent value.
+*/
+bool TestExp1()
+{
+	/* a tensor of size (3, 2) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;
+
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+	                      {-1.0F, -2.0F},
+	                      {0.0F, 0.5F} };
+	DTYPE answer[3][2] = { {2.7183F, 7.3891F},
+	                       {0.3679F, 0.1353F},
+	                       {1.0F, 1.6487F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);
+
+	/* call Exp function */
+	_Exp(a, b);
+	_ExpMe(aMe);
+    bUser = Exp(*a);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && 
+              aMe->CheckData(answer, unitNum, 1e-4F) && 
+              bUser.CheckData(answer, unitNum, 1e-4F);
+    
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);
+
+	/* call Exp function */
+    _Exp(aGPU, bGPU);
+	_ExpMe(aMeGPU);
+    bUserGPU = Exp(*aGPU);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, unitNum, 1e-4F) && \
+              bUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+	delete[] dimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] dimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Exp Function */
+bool TestExp()
+{
+	XPRINT(0, stdout, "[TEST Exp] set every entry to its exponent value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestExp1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
+++ b/source/tensor/core/arithmetic/MatrixMULBatchedCPU.h
@@ -16,20 +16,16 @@
 */

 /*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
 */

-#ifndef __MATRIXMULBATCHEDCPU_H__
-#define __MATRIXMULBATCHEDCPU_H__
-
-#include "../../XTensor.h"
+#ifndef __TEST_EXP_H__
+#define __TEST_EXP_H__

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* matrix multiplication in batch mode (CPU code) */
-void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, 
-                          XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
+/* test for Exp Function */
+bool TestExp();

 } // namespace nts(NiuTrans.Tensor)
-
-#endif // __MATRIXMULBATCHEDCPU_H__
\ No newline at end of file
+#endif // __TEST_EXP_H__
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
@@ -19,6 +19,7 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
 */

+#include "../core/math/Unary.h"
 #include "TLog.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -30,14 +31,14 @@ Set every entry to its log value.
 bool TestLog1()
 {
 	/* a tensor of size (3, 2) */
-	int aOrder = 2;
-	int * aDimSize = new int[aOrder];
-	aDimSize[0] = 3;
-	aDimSize[1] = 2;
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;

-	int aUnitNum = 1;
-	for (int i = 0; i < aOrder; i++)
-		aUnitNum *= aDimSize[i];
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];

 	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
 	                      {0.5F, 4.0F},
@@ -50,14 +51,14 @@ bool TestLog1()
 	bool cpuTest = true;

 	/* create tensors */
-	XTensor * a = NewTensor(aOrder, aDimSize);
-    XTensor * b = NewTensor(aOrder, aDimSize);
-	XTensor * aMe = NewTensor(aOrder, aDimSize);
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
    XTensor bUser;

 	/* initialize variables */
-	a->SetData(aData, aUnitNum);
-	aMe->SetData(aData, aUnitNum);
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);

 	/* call Log function */
 	_Log(a, b);
@@ -65,21 +66,21 @@ bool TestLog1()
    bUser = Log(*a);

 	/* check results */
-	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
    
 #ifdef USE_CUDA
 	/* GPU test */
 	bool gpuTest = true;

 	/* create tensor */
-	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    XTensor bUserGPU;

 	/* Initialize variables */
-	aGPU->SetData(aData, aUnitNum);
-	aMeGPU->SetData(aData, aUnitNum);
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);

 	/* call Log function */
    _Log(aGPU, bGPU);
@@ -87,7 +88,7 @@ bool TestLog1()
    bUserGPU = Log(*aGPU);

 	/* check results */
-	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);

 	/* destroy variables */
 	delete a;
@@ -96,7 +97,7 @@ bool TestLog1()
    delete aGPU;
    delete bGPU;
    delete aMeGPU;
-	delete[] aDimSize;
+	delete[] dimSize;

 	return cpuTest && gpuTest;
 #else
@@ -104,7 +105,7 @@ bool TestLog1()
 	delete a;
 	delete b;
 	delete aMe;
-	delete[] aDimSize;
+	delete[] dimSize;

 	return cpuTest;
 #endif // USE_CUDA

--- a/source/tensor/test/TLog.h
+++ b/source/tensor/test/TLog.h
@@ -22,8 +22,6 @@
 #ifndef __TEST_LOG_H__
 #define __TEST_LOG_H__

-#include "../core/math/Log.h"
-
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* test for Log Function */

--- a/source/tensor/test/TLogSoftmax.h
+++ b/source/tensor/test/TLogSoftmax.h
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
-*/
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
+ */

 #ifndef __TEST_LOGSOFTMAX_H__
 #define __TEST_LOGSOFTMAX_H__

--- a/source/tensor/test/TMatrixMULBatchedCPU.cpp
+++ b/source/tensor/test/TMatrixMULBatchedCPU.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
-*/
-
-#include "TMatrixMULBatchedCPU.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* 
-case 1: matrix multiplication in batch mode (CPU code). 
-In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
-*/
-bool TestMatrixMulBatchedCPU1()
-{
-    /* create list */
-    XList * aList = new XList();
-    XList * bList = new XList();
-    XList * cList = new XList();
-
-    /* a source tensor of size (2, 3) */
-    int aOrder = 2;
-    int * aDimSize = new int[aOrder];
-    aDimSize[0] = 2;
-    aDimSize[1] = 3;
-
-    int aUnitNum = 1;
-    for (int i = 0; i < aOrder; i++)
-        aUnitNum *= aDimSize[i];
-
-    /* a source tensor of size (3, 2) */
-    int bOrder = 2;
-    int * bDimSize = new int[bOrder];
-    bDimSize[0] = 3;
-    bDimSize[1] = 2;
-
-    int bUnitNum = 1;
-    for (int i = 0; i < bOrder; i++)
-        bUnitNum *= bDimSize[i];
-
-    /* a target tensor of size (2, 2) */
-    int cOrder = 2;
-    int * cDimSize = new int[cOrder];
-    cDimSize[0] = 2;
-    cDimSize[1] = 2;
-
-    int cUnitNum = 1;
-    for (int i = 0; i < cOrder; i++)
-        cUnitNum *= cDimSize[i];
-
-    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
-                           {-4.0F, 5.0F, 6.0F} };
-    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
-                           {-4.0F, 3.0F, 2.0F} };
-    DTYPE bData1[3][2] = { {0.0F, -1.0F},
-                           {1.0F, 2.0F}, 
-                           {2.0F, 1.0F} };
-    DTYPE bData2[3][2] = { {0.0F, 1.0F},
-                           {3.0F, 2.0F}, 
-                           {2.0F, 1.0F} };
-    DTYPE answer1[2][2] = { {8.0F, 6.0F}, 
-                            {17.0F, 20.0F} };
-    DTYPE answer2[2][2] = { {-12.0F, -6.0F}, 
-                            {13.0F, 4.0F} };
-
-    /* CPU test */
-    bool cpuTest = true;
-
-    /* create tensors */
-    XTensor * a1 = NewTensor(aOrder, aDimSize);
-    XTensor * a2 = NewTensor(aOrder, aDimSize);
-    XTensor * b1 = NewTensor(bOrder, bDimSize);
-    XTensor * b2 = NewTensor(bOrder, bDimSize);
-    XTensor * c1 = NewTensor(cOrder, cDimSize);
-    XTensor * c2 = NewTensor(cOrder, cDimSize);
-
-    /* initialize variables */
-    a1->SetData(aData1, aUnitNum);
-    a2->SetData(aData2, aUnitNum);
-    b1->SetData(bData1, aUnitNum);
-    b2->SetData(bData2, aUnitNum);
-    c1->SetZeroAll();
-    c2->SetZeroAll();
-
-    /* add tensors to list */
-    aList->Add(a1);
-    aList->Add(a2);
-    bList->Add(b1);
-    bList->Add(b2);
-    cList->Add(c1);
-    cList->Add(c2);
-
-    /* call MatrixMULBatchedCPU function */
-    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
-
-    /* check results */
-    cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
-    
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-
-    /* create tensors */
-    XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU1 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU2 = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU1 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * cGPU2 = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
-
-    /* initialize variables */
-    aGPU1->SetData(aData1, aUnitNum);
-    aGPU2->SetData(aData2, aUnitNum);
-    bGPU1->SetData(bData1, aUnitNum);
-    bGPU2->SetData(bData2, aUnitNum);
-    cGPU1->SetZeroAll();
-    cGPU2->SetZeroAll();
-    
-    /* clear list */
-    aList->Clear();
-    bList->Clear();
-    cList->Clear();
-
-    /* add tensors to list */
-    aList->Add(aGPU1);
-    aList->Add(aGPU2);
-    bList->Add(bGPU1);
-    bList->Add(bGPU2);
-    cList->Add(cGPU1);
-    cList->Add(cGPU2);
-
-    /* call MatrixMULBatchedCPU function */
-    _MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
-
-    /* check results */
-    gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
-    gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;
-
-    /* destroy variables */
-    delete a1;
-    delete a2;
-    delete b1;
-    delete b2;
-    delete c1;
-    delete c2;
-    delete aGPU1;
-    delete aGPU2;
-    delete bGPU1;
-    delete bGPU2;
-    delete cGPU1;
-    delete cGPU2;
-    delete[] aDimSize;
-    delete[] bDimSize;
-    delete[] cDimSize;
-    
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a1;
-    delete a2;
-    delete b1;
-    delete b2;
-    delete c1;
-    delete c2;
-    delete[] aDimSize;
-    delete[] bDimSize;
-    delete[] cDimSize;
-
-    return cpuTest;
-#endif // USE_CUDA
-}
-
-/* other cases */
-/*
-    TODO!!
-*/
-
-/* test for MatrixMulBatchedCPU Function */
-extern "C"
-bool TestMatrixMulBatchedCPU()
-{
-    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");
-    bool returnFlag = true, caseFlag = true;
-
-    /* case 1 test */
-    caseFlag = TestMatrixMulBatchedCPU1();
-
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 1 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 1 passed!\n");
-
-    /* other cases test */
-    /*
-    TODO!!
-    */
-
-    if (returnFlag) {
-        XPRINT(0, stdout, ">> All Passed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> Failed!\n");
-
-    XPRINT(0, stdout, "\n");
-
-    return returnFlag;
-}
-
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
@@ -25,133 +25,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* 
 case 1: element-wise product of two tensors
-c(i) = a(i)*b(i) + \alpha * c(i) 
-In this case, (2, 1)  (2, 1) -> (2, 1), leadingDim=0, alpha=0.
-*/
-bool TestMultiply1()
-{
-	/* a source tensor of size (2, 1) */
-	int sOrder1 = 2;
-	int * sDimSize1 = new int[sOrder1];
-	sDimSize1[0] = 2;
-	sDimSize1[1] = 1;
-
-	int sUnitNum1 = 1;
-	for (int i = 0; i < sOrder1; i++)
-		sUnitNum1 *= sDimSize1[i];
-
-	/* a source tensor of size (2, 1) */
-	int sOrder2 = 2;
-	int * sDimSize2 = new int[sOrder2];
-	sDimSize2[0] = 2;
-	sDimSize2[1] = 1;
-
-	int sUnitNum2 = 1;
-	for (int i = 0; i < sOrder2; i++)
-		sUnitNum2 *= sDimSize2[i];
-
-	/* a target tensor of size (2, 1) */
-	int tOrder = 2;
-	int * tDimSize = new int[tOrder];
-	tDimSize[0] = 2;
-	tDimSize[1] = 1;
-
-	int tUnitNum = 1;
-	for (int i = 0; i < tOrder; i++)
-		tUnitNum *= tDimSize[i];
-
-	DTYPE sData1[2][1] = { {0.0F}, 
-                           {1.0F} };
-	DTYPE sData2[2][1] = { {2.0F},
-                           {3.0F} };
-	DTYPE answer[2][1] = { {0.0F},
-                           {3.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * s1 = NewTensor(sOrder1, sDimSize1);
-	XTensor * s2 = NewTensor(sOrder2, sDimSize2);
-	XTensor * t = NewTensor(tOrder, tDimSize);
-	XTensor * tMe = NewTensor(tOrder, tDimSize);
-    XTensor tUser;
-
-	/* initialize variables */
-	s1->SetData(sData1, sUnitNum1);
-	tMe->SetData(sData1, sUnitNum1);
-	s2->SetData(sData2, sUnitNum2);
-	t->SetZeroAll();
-
-	/* call Multiply function */
-	_Multiply(s1, s2, t, 0, 0);
-	_MultiplyMe(tMe, s2, 0, 0);
-    tUser = Multiply(*s1, *s2, 0);
-
-	/* check results */
-	cpuTest = t->CheckData(answer, tUnitNum) 
-        && tMe->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
-
-#ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-    XTensor tUserGPU;
-
-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	tMeGPU->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
-
-	/* call Multiply function */
-	_Multiply(sGPU1, sGPU2, tGPU, 0, 0);
-	_MultiplyMe(tMeGPU, sGPU2, 0, 0);
-    tUserGPU = Multiply(*sGPU1, *sGPU2, 0);
-
-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum)
-        && tMeGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
-
-	/* destroy variables */
-    delete s1;
-    delete s2;
-    delete t;
-    delete tMe;
-    delete sGPU1;
-    delete sGPU2;
-    delete tGPU;
-    delete tMeGPU;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-
-	return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete s1;
-    delete s2;
-    delete t;
-    delete tMe;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-
-	return cpuTest;
-#endif // USE_CUDA
-}
-
-/* 
-case 2: element-wise product of two tensors
 c(i) = a(i)*b(i) + \alpha * c(i)
 In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
 */
-bool TestMultiply2()
+bool TestMultiply1()
 {
 	/* a source tensor of size (2, 2) */
 	int sOrder1 = 2;
@@ -212,8 +89,9 @@ bool TestMultiply2()
    tUser = Multiply(*s1, *s2, 0);

 	/* check results */
-	cpuTest = t->CheckData(answer, tUnitNum) 
-        && tMe->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
+	cpuTest = t->CheckData(answer, tUnitNum) && 
+              tMe->CheckData(answer, tUnitNum) && 
+              tUser.CheckData(answer, tUnitNum);

 #ifdef USE_CUDA
 	/* GPU test */
@@ -270,113 +148,6 @@ bool TestMultiply2()
 #endif // USE_CUDA
 }

-/* 
-case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
-In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=1, alpha=0.
-*/
-bool TestMultiply3()
-{
-	/* a source tensor of size (2, 2) */
-	int sOrder1 = 2;
-	int * sDimSize1 = new int[sOrder1];
-	sDimSize1[0] = 2;
-	sDimSize1[1] = 2;
-
-	int sUnitNum1 = 1;
-	for (int i = 0; i < sOrder1; i++)
-		sUnitNum1 *= sDimSize1[i];
-
-	/* a source tensor of size (2, 2) */
-	int sOrder2 = 2;
-	int * sDimSize2 = new int[sOrder2];
-	sDimSize2[0] = 2;
-	sDimSize2[1] = 2;
-
-	int sUnitNum2 = 1;
-	for (int i = 0; i < sOrder2; i++)
-		sUnitNum2 *= sDimSize2[i];
-
-	/* a target tensor of size (2, 2) */
-	int tOrder = 2;
-	int * tDimSize = new int[tOrder];
-	tDimSize[0] = 2;
-	tDimSize[1] = 2;
-
-	int tUnitNum = 1;
-	for (int i = 0; i < tOrder; i++)
-		tUnitNum *= tDimSize[i];
-
-	DTYPE sData1[2][2] = { {0.0F, 1.0F},
-	                       {2.0F, 3.0F} };
-	DTYPE sData2[2][2] = { {0.0F, 1.0F},
-	                       {2.0F, 3.0F} };
-	DTYPE answer[2][2] = { {0.0F, 1.0F},
-	                       {4.0F, 9.0F} };
-
-	/* CPU test */
-	bool cpuTest = true;
-
-	/* create tensors */
-	XTensor * s1 = NewTensor(sOrder1, sDimSize1);
-	XTensor * s2 = NewTensor(sOrder2, sDimSize2);
-	XTensor * t = NewTensor(tOrder, tDimSize);
-
-	/* initialize variables */
-	s1->SetData(sData1, sUnitNum1);
-	s2->SetData(sData2, sUnitNum2);
-	t->SetZeroAll();
-
-	/* call MultiplyElementWise function */
-	_Multiply(s1, s2, t, 0, 1);
-
-	/* check results */
-	cpuTest = t->CheckData(answer, tUnitNum);
-
-#ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-
-	/* create tensor */
-	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
-	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
-	XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-
-	/* Initialize variables */
-	sGPU1->SetData(sData1, sUnitNum1);
-	sGPU2->SetData(sData2, sUnitNum2);
-	tGPU->SetZeroAll();
-
-	/* call MultiplyElementWise function */
-	_Multiply(sGPU1, sGPU2, tGPU, 0, 1);
-
-	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum);
-
-	/* destroy variables */
-    delete s1;
-    delete s2;
-    delete t;
-    delete sGPU1;
-    delete sGPU2;
-    delete tGPU;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-
-	return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete s1;
-    delete s2;
-    delete t;
-    delete[] sDimSize1;
-    delete[] sDimSize2;
-    delete[] tDimSize;
-
-	return cpuTest;
-#endif // USE_CUDA
-}
-
 /* other cases */
 /*
 TODO!!
@@ -398,26 +169,6 @@ bool TestMultiply()
 	else
 		XPRINT(0, stdout, ">> case 1 passed!\n");

-	/* case 2 test */
-	caseFlag = TestMultiply2();
-
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 2 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 2 passed!\n");
-
-	/* case 3 test */
-	caseFlag = TestMultiply3();
-
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 3 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 3 passed!\n");
-
 	/* other cases test */
 	/*
 	TODO!!

--- a/source/tensor/test/TMultiply.h
+++ b/source/tensor/test/TMultiply.h
@@ -19,16 +19,17 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
 */

-#ifndef __TEST_MULTIPLYELEMENTWISE_H__
-#define __TEST_MULTIPLYELEMENTWISE_H__
+#ifndef __TEST_MULTIPLY_H__
+#define __TEST_MULTIPLY_H__

 #include "../core/arithmetic/Multiply.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/* test for MultiplyElementWise Function */
+/* test for Multiply Function */
 extern "C"
 bool TestMultiply();

 } // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_MULTIPLYELEMENTWISE_H__
+
+#endif // __TEST_MULTIPLY_H__
--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
@@ -154,6 +154,84 @@ bool TestReduceSum1()
 #endif // USE_CUDA
 }

+void TestReduceSum2()
+{
+#ifdef USE_CUDA
+
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 128;
+    sDimSize[1] = 16400;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    int tOrder1 = 1;
+    int * tDimSize1 = new int[tOrder1];
+    tDimSize1[0] = 16400;
+
+    int tUnitNum1 = 1;
+    for (int i = 0; i < tOrder1; i++)
+        tUnitNum1 *= tDimSize1[i];
+
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
+
+    /* initialize variables */
+    /* initialize variables */
+    FILE *dataFile;
+    char dataString[32];
+    const int dataSize = 128 * 16400;
+    DTYPE sData[dataSize];
+    if ((dataFile = fopen("D:\\Work\\TensorFlowLearn\\testdata.in", "r")) == NULL)
+    {
+        printf("file open fail");
+        exit(1);
+    }
+    for (int i = 0; i < dataSize; ++i)
+    {
+        if (fscanf(dataFile, "%s", dataString) != EOF)
+        {
+            //printf("%s", dataString);
+            sData[i] = atof(dataString);
+            //srcTensorData[i] = i;
+        }
+        else
+        {
+            printf("read wrong");
+            break;
+        }
+    }
+    sGPU->SetData(sData, sUnitNum);
+    tGPU1->SetZeroAll();
+
+    /* call ReduceSum function */
+    //_ReduceSum(sGPU, tGPU1, 0);
+
+    /* check results */
+    DTYPE TensorData[dataSize];
+    cudaMemcpy(TensorData, tGPU1->data, sizeof(DTYPE)* tUnitNum1, cudaMemcpyDeviceToHost);
+    for (int i = 0; i < 1000; ++i)
+    {
+        //check += TensorData[i * 2];
+        printf("%f ", TensorData[i]);
+        if ((i + 1) % 5 == 0) printf("\n");
+    }
+
+    /* destroy variables */
+    delete sGPU;
+    delete tGPU1;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+
+#endif
+}
+
 /* other cases */
 /*
 TODO!!
@@ -167,6 +245,7 @@ bool TestReduceSum()

    /* case 1 test */
    caseFlag = TestReduceSum1();
+    //TestReduceSum2();
    if (!caseFlag) {
        returnFlag = false;
        XPRINT(0, stdout, ">> case 1 failed!\n");

--- a/source/tensor/test/TRound.cpp
+++ b/source/tensor/test/TRound.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#include "../core/math/Unary.h"
+#include "TRound.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Round function.
+Set every entry to its round value.
+*/
+bool TestRound1()
+{
+    return true;
+
+	/* a tensor of size (3, 2) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;
+
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];
+
+	DTYPE aData[3][2] = { {1.3F, 2.7F}, 
+	                      {-1.3F, -2.7F},
+	                      {0.0F, 0.5F} };
+	DTYPE answer[3][2] = { {1.0F, 3.0F},
+	                       {-1.0F, -3.0F},
+	                       {0.0F, 1.0F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);
+
+	/* call Round function */
+	//_Round(a, b);
+	//_RoundMe(aMe);
+    //bUser = Round(*a);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && 
+              aMe->CheckData(answer, unitNum, 1e-4F) && 
+              bUser.CheckData(answer, unitNum, 1e-4F);
+    
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);
+
+	/* call Round function */
+    //_Round(aGPU, bGPU);
+	//_RoundMe(aMeGPU);
+    //bUserGPU = Round(*aGPU);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && 
+              aMeGPU->CheckData(answer, unitNum, 1e-4F) && 
+              bUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+	delete[] dimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] dimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Round Function */
+bool TestRound()
+{
+	XPRINT(0, stdout, "[TEST Round] set every entry to its round value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestRound1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Absolute.cuh
+++ b/source/tensor/core/arithmetic/Absolute.cuh
@@ -16,26 +16,16 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-03
 */

-#include "Absolute.h"
+#ifndef __TEST_ROUND_H__
+#define __TEST_ROUND_H__

 namespace nts { // namespace nts(NiuTrans.Tensor)

-#ifdef USE_CUDA
+/* test for Round Function */
+bool TestRound();

-/* set each entry to its absolute value (CUDA Kernel) */
-__global__
-void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
-
-/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
-__global__
-void KernelAbsolute(__half * a, __half * b, int size);
-
-/* set each entry to its absolute value */
-void _CudaAbsolute(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_ROUND_H__
--- a/source/tensor/test/TSin.cpp
+++ b/source/tensor/test/TSin.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#include "../core/math/Unary.h"
+#include "TSin.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Sin function.
+Set every entry to its sine value.
+*/
+bool TestSin1()
+{
+	/* a tensor of size (3, 2) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;
+
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+	                      {-1.0F, -2.0F},
+	                      {0.0F, 0.5F} };
+	DTYPE answer[3][2] = { {0.8415F, 0.9093F},
+	                       {-0.8415F, -0.9093F},
+	                       {0.0F, 0.4794F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);
+
+	/* call Sin function */
+	_Sin(a, b);
+	_SinMe(aMe);
+    bUser = Sin(*a);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+    
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);
+
+	/* call Sin function */
+    _Sin(aGPU, bGPU);
+	_SinMe(aMeGPU);
+    bUserGPU = Sin(*aGPU);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+	delete[] dimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] dimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Sin Function */
+bool TestSin()
+{
+	XPRINT(0, stdout, "[TEST Sin] set every entry to its sine value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestSin1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/math/Log.cuh
+++ b/source/tensor/core/math/Log.cuh
@@ -16,31 +16,16 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
 */

-#ifndef __LOG_CUH__
-#define __LOG_CUH__
-
-#include "Log.h"
+#ifndef __TEST_SIN_H__
+#define __TEST_SIN_H__

 namespace nts { // namespace nts(NiuTrans.Tensor)

-#ifdef USE_CUDA
-
-/* set each entry to its log value (CUDA Kernel) */
-__global__
-void KernelLog(DTYPE * a, DTYPE * b, int size);
-
-/* set each entry to its log value (CUDA Kernel) with float16 data type*/
-__global__
-void KernelLog(__half * a, __half * b, int size);
-
-/* set each entry to its log value */
-void _CudaLog(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
+/* test for Sin Function */
+bool TestSin();

 } // namespace nts(NiuTrans.Tensor)
-
-#endif // __LOG_CUH__
\ No newline at end of file
+#endif // __TEST_SIN_H__
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
@@ -214,8 +214,8 @@ bool TestSoftmax3Gpu()

    int order = 2;
    int * dimSize = new int[order];
-    dimSize[0] = 8;
-    dimSize[1] = 1000;
+    dimSize[0] = 10;
+    dimSize[1] = 10000;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
@@ -228,7 +228,7 @@ bool TestSoftmax3Gpu()
    /* initialize variables */
    FILE *dataFile;
    char dataString[32];
-    const int dataSize = 8 * 1000;
+    const int dataSize = 10 * 10000;
    DTYPE xData[dataSize];
    if ((dataFile = fopen("D:\\Work\\TensorFlowLearn\\testdata.in", "r")) == NULL)
    {
@@ -261,9 +261,9 @@ bool TestSoftmax3Gpu()
    DTYPE check = 0;
    DTYPE TensorData[dataSize];
    cudaMemcpy(TensorData, yGPU->data, sizeof(DTYPE)* unitNum, cudaMemcpyDeviceToHost);
-    for (int i = 0; i < 1000; ++i)
+    for (int i = 0; i < 10000; ++i)
    {
-        check += TensorData[i];
+        check += TensorData[i*2];
        //printf("%f ", TensorData[i]);
    }
    printf("\n%f \n", check);

--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#include "TSub.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* case 1: tensor subtraction c = a - b * \beta */
+bool TestSub1()
+{
+    /* a tensor of size (2, 4) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 4;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
+    DTYPE answer[2][4] = { {-1.0F, 2.0F, 5.0F, 8.0F},
+                           {11.0F, 14.0F, 17.0F, 20.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+    XTensor * c = NewTensor(order, dimSize);
+    XTensor * cMe = NewTensor(order, dimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetData(aData, unitNum);
+    cMe->SetData(aData, unitNum);
+    b->SetData(bData, unitNum);
+    c->SetZeroAll();
+
+    /* call Sub function */
+    _Sub(a, b, c);
+    _SubMe(cMe, b);
+    cUser = Sub(*a, *b);
+
+    /* check results */
+    cpuTest = c->CheckData(answer, unitNum)
+              && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, unitNum);
+    cMeGPU->SetData(aData, unitNum);
+    bGPU->SetData(bData, unitNum);
+    cGPU->SetZeroAll();
+
+    /* call Sub function */
+    _Sub(aGPU, bGPU, cGPU);
+    _SubMe(cMeGPU, bGPU);
+    cUserGPU = Sub(*aGPU, *bGPU);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F)
+              && cMeGPU->CheckData(answer, unitNum, 1e-4F) && cUserGPU.CheckData(answer, unitNum, 1e-4F);
+    
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] dimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+	delete b;
+	delete c;
+    delete cMe;
+    delete[] dimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* case 2: tensor subtraction c = a - b * \beta */
+bool TestSub2()
+{
+    /* a tensor of size (2, 4) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 2;
+    dimSize[1] = 4;
+
+    int unitNum = 1;
+    for (int i = 0; i < order; i++) {
+        unitNum *= dimSize[i];
+    }
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
+    DTYPE answer[2][4] = { {-0.5F, 1.5F, 3.5F, 5.5F},
+                           {7.5F, 9.5F, 11.5F, 13.5F} };
+    float beta = 0.5F;
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensor */
+    XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+    XTensor * c = NewTensor(order, dimSize);
+    XTensor * cMe = NewTensor(order, dimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetData(aData, unitNum);
+    cMe->SetData(aData, unitNum);
+    b->SetData(bData, unitNum);
+    c->SetZeroAll();
+
+    /* call Sub function */
+    _Sub(a, b, c, beta);
+    _SubMe(cMe, b, beta);
+    cUser = Sub(*a, *b, beta);
+
+    /* check results */
+    cpuTest = c->CheckData(answer, unitNum)
+              && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, unitNum);
+    cMeGPU->SetData(aData, unitNum);
+    bGPU->SetData(bData, unitNum);
+    cGPU->SetZeroAll();
+
+    /* call Sub function */
+    _Sub(aGPU, bGPU, cGPU, beta);
+    _SubMe(cMeGPU, bGPU, beta);
+    cUserGPU = Sub(*aGPU, *bGPU, beta);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer, unitNum, 1e-4F)
+              && cMeGPU->CheckData(answer, unitNum, 1e-4F) && cUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] dimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete[] dimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+    TODO!!
+*/
+
+/* test for Sub Function */
+bool TestSub()
+{
+    XPRINT(0, stdout, "[TEST SUB] tensor subtraction c = a - b * beta\n");
+    bool returnFlag = true, caseFlag = true;
+
+    /* case 1 test */
+    caseFlag = TestSub1();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+
+    /* case 2 test */
+    caseFlag = TestSub2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+
+    /* other cases test */
+    /*
+        TODO!!
+    */
+
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");
+
+    XPRINT(0, stdout, "\n");
+
+    return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TSub.h
+++ b/source/tensor/test/TSub.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
+ */
+
+#ifndef __TEST_SUB_H__
+#define __TEST_SUB_H__
+
+#include "../core/arithmetic/Sub.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Sub Function */
+bool TestSub();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SUB_H__
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
-*/
+ * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
+ */

 #include "TSum.h"

@@ -59,14 +59,14 @@ bool TestSum1()
    b->SetData(bData, unitNum);
    c->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    _Sum(a, b, c);
    _SumMe(cMe, b);
    cUser = Sum(*a, *b);

    /* check results */
    cpuTest = c->CheckData(answer, unitNum)
-        && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
+              && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);

 #ifdef USE_CUDA
    /* GPU test */
@@ -85,14 +85,14 @@ bool TestSum1()
    bGPU->SetData(bData, unitNum);
    cGPU->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    _Sum(aGPU, bGPU, cGPU);
    _SumMe(cMeGPU, bGPU);
    cUserGPU = Sum(*aGPU, *bGPU);

    /* check results */
    gpuTest = cGPU->CheckData(answer, unitNum)
-        && cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);
+              && cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);

    /* destroy variables */
    delete a;
@@ -155,14 +155,14 @@ bool TestSum2()
    b->SetData(bData, unitNum);
    c->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    _Sum(a, b, c, beta);
    _SumMe(cMe, b, beta);
    cUser = Sum(*a, *b, beta);

    /* check results */
    cpuTest = c->CheckData(answer, unitNum)
-        && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
+              && cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);

 #ifdef USE_CUDA
    /* GPU test */
@@ -181,14 +181,14 @@ bool TestSum2()
    bGPU->SetData(bData, unitNum);
    cGPU->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    _Sum(aGPU, bGPU, cGPU, beta);
    _SumMe(cMeGPU, bGPU, beta);
    cUserGPU = Sum(*aGPU, *bGPU, beta);

    /* check results */
    gpuTest = cGPU->CheckData(answer, unitNum)
-        && cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);
+              && cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);

    /* destroy variables */
    delete a;

--- a/source/tensor/test/TSum.h
+++ b/source/tensor/test/TSum.h
@@ -16,8 +16,8 @@
 */

 /*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
-*/
+ * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
+ */

 #ifndef __TEST_SUM_H__
 #define __TEST_SUM_H__

--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
+*/
+
+#include "TSumDim.h"
+#include "../core/arithmetic/SumDim.h"
+#include "../XTensor.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* 
+case 1: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting 
+*/
+bool TestSumDim1()
+{
+    /* a tensor of size (2, 4) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 4;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (2) */
+    int bOrder = 1;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2] = {1.0F, -1.0F};
+    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
+                           {3.0F, 4.0F, 5.0F, 6.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    cMe->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    c->SetZeroAll();
+
+    /* call SumDim function */
+    _SumDim(a, b, c, 0);
+    _SumDim(cMe, b, 0);
+    cUser = SumDim(*a, *b, 0);
+
+    /* check results */
+    cpuTest = c->CheckData(answer, aUnitNum)
+              && cMe->CheckData(answer, aUnitNum) 
+              && cUser.CheckData(answer, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    cMeGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    cGPU->SetZeroAll();
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 0);
+    _SumDim(cMeGPU, bGPU, 0);
+    cUserGPU = SumDim(*aGPU, *bGPU, 0);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer, aUnitNum)
+              && cMeGPU->CheckData(answer, aUnitNum) 
+              && cUserGPU.CheckData(answer, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+	delete b;
+	delete c;
+    delete cMe;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* 
+case 2: tensor summation c = a + b * \beta 
+where the size of b is equal to the n-th dimension of a, 
+i.e., a is summed with b by broadcasting 
+*/
+bool TestSumDim2()
+{
+    /* a tensor of size (2, 4) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 4;
+
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (2, 2) */
+    int bOrder = 2;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+    bDimSize[1] = 2;
+
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][2] = { {1.0F, -1.0F},
+                          {-1.0F, 1.0F} };
+    DTYPE answer[2][4] = { {1.0F, 0.0F, 1.0F, 4.0F},
+                           {5.0F, 4.0F, 5.0F, 8.0F} };
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(aOrder, aDimSize);
+    XTensor * cMe = NewTensor(aOrder, aDimSize);
+    XTensor cUser;
+
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    cMe->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    c->SetZeroAll();
+
+    /* call SumDim function */
+    _SumDim(a, b, c, 1);
+    _SumDim(cMe, b, 1);
+    cUser = SumDim(*a, *b, 1);
+
+    /* check results */
+    cpuTest = c->CheckData(answer, aUnitNum)
+              && cMe->CheckData(answer, aUnitNum) 
+              && cUser.CheckData(answer, aUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor cUserGPU;
+
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    cMeGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    cGPU->SetZeroAll();
+
+    /* call sum function */
+    _SumDim(aGPU, bGPU, cGPU, 1);
+    _SumDim(cMeGPU, bGPU, 1);
+    cUserGPU = SumDim(*aGPU, *bGPU, 1);
+
+    /* check results */
+    gpuTest = cGPU->CheckData(answer, aUnitNum)
+              && cMeGPU->CheckData(answer, aUnitNum) 
+              && cUserGPU.CheckData(answer, aUnitNum);
+
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete cMe;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete cMeGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+	delete b;
+	delete c;
+    delete cMe;
+    delete[] aDimSize;
+    delete[] bDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+    TODO!!
+*/
+
+/* test for SumDim Function */
+bool TestSumDim()
+{
+    XPRINT(0, stdout, "[TEST SUMDIM] tensor summation c = a + b * beta by broadcasting\n");
+    bool returnFlag = true, caseFlag = true;
+
+    /* case 1 test */
+    caseFlag = TestSumDim1();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+
+    /* case 2 test */
+    caseFlag = TestSumDim2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+
+    /* other cases test */
+    /*
+        TODO!!
+    */
+
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");
+
+    XPRINT(0, stdout, "\n");
+
+    return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TSumDim.h
+++ b/source/tensor/test/TSumDim.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
+* I finish my summer holidays and go back to study.
+*/
+
+#ifndef __TEST_SUMDIM_H__
+#define __TEST_SUMDIM_H__
+
+#include "../core/arithmetic/SumDim.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for SumDim Function */
+extern "C"
+bool TestSumDim();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SUMDIM_H__
--- a/source/tensor/test/TTan.cpp
+++ b/source/tensor/test/TTan.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#include "../core/math/Unary.h"
+#include "TTan.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Tan function.
+Set every entry to its tangent value.
+*/
+bool TestTan1()
+{
+	/* a tensor of size (3, 2) */
+	int order = 2;
+	int * dimSize = new int[order];
+	dimSize[0] = 3;
+	dimSize[1] = 2;
+
+	int unitNum = 1;
+	for (int i = 0; i < order; i++)
+		unitNum *= dimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+	                      {-1.0F, -2.0F},
+	                      {0.0F, 0.5F} };
+	DTYPE answer[3][2] = { {1.5574F, -2.1850F},
+	                       {-1.5574F, 2.1850F},
+	                       {0.0F, 0.5463F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize);
+	XTensor * aMe = NewTensor(order, dimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, unitNum);
+	aMe->SetData(aData, unitNum);
+
+	/* call Tan function */
+	_Tan(a, b);
+	_TanMe(aMe);
+    bUser = Tan(*a);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, unitNum, 1e-4F) && aMe->CheckData(answer, unitNum, 1e-4F) && bUser.CheckData(answer, unitNum, 1e-4F);
+    
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+	XTensor * aMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, unitNum);
+	aMeGPU->SetData(aData, unitNum);
+
+	/* call Tan function */
+    _Tan(aGPU, bGPU);
+	_TanMe(aMeGPU);
+    bUserGPU = Tan(*aGPU);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, unitNum, 1e-4F) && aMeGPU->CheckData(answer, unitNum, 1e-4F) && bUserGPU.CheckData(answer, unitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+    delete aGPU;
+    delete bGPU;
+    delete aMeGPU;
+	delete[] dimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete aMe;
+	delete[] dimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Tan Function */
+bool TestTan()
+{
+	XPRINT(0, stdout, "[TEST Tan] set every entry to its tangent value \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestTan1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TTan.h
+++ b/source/tensor/test/TTan.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
+*/
+
+#ifndef __TEST_TAN_H__
+#define __TEST_TAN_H__
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Tan Function */
+bool TestTan();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_TAN_H__
--- a/source/tensor/test/TTopK.cpp
+++ b/source/tensor/test/TTopK.cpp
@@ -20,7 +20,7 @@
 */

 #include "TTopK.h"
-#include "TSort.h"
+
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* 
@@ -97,21 +97,12 @@ bool TestTopK1()
    int dim = 0;
    int k = sDimSize[dim];
    _TopK(s, t1, index1, dim, k);
-    _SortMe(t1, index1, dim);
    TopK(sUser, tUser1, indexUser1, dim, k);
-    _SortMe(&tUser1, &indexUser1, dim);
-
-    t1->Dump(stderr);
-    tUser1.Dump(stderr);
-
-    index1->Dump(stderr);

    dim = 1;
    k = sDimSize[dim];
    _TopK(s, t2, index2, dim, k);
-    _SortMe(t2, index2, dim);
    TopK(sUser, tUser2, indexUser2, dim, k);
-    _SortMe(&tUser2, &indexUser2, dim);

    /* check results */
    cpuTest = t1->CheckData(tAnswer1, tUnitNum) && tUser1.CheckData(tAnswer1, tUnitNum)

--- a/source/tensor/test/TTranspose.cpp
+++ b/source/tensor/test/TTranspose.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
+*/
+
+#include "TTranspose.h"
+#include "../core/movement/CopyValues.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+case 1: test Transpose function.
+tensor transposition of dimensions i and j 
+*/
+bool TestTranspose1()
+{
+	/* a tensor of size (3, 2) */
+	int aOrder = 2;
+	int * aDimSize = new int[aOrder];
+	aDimSize[0] = 3;
+	aDimSize[1] = 2;
+
+	int aUnitNum = 1;
+	for (int i = 0; i < aOrder; i++)
+		aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (2, 3) */
+	int bOrder = 2;
+	int * bDimSize = new int[bOrder];
+	bDimSize[0] = 2;
+	bDimSize[1] = 3;
+
+	int bUnitNum = 1;
+	for (int i = 0; i < bOrder; i++)
+		bUnitNum *= bDimSize[i];
+
+	DTYPE aData[3][2] = { {1.0F, 2.0F}, 
+	                      {3.0F, 4.0F},
+	                      {5.0F, 6.0F} };
+	DTYPE answer[2][3] = { {1.0F, 3.0F, 5.0F},
+	                       {2.0F, 4.0F, 6.0F} };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(aOrder, aDimSize);
+	XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, aUnitNum);
+
+	/* call Transpose function */
+    _Transpose(a, b, 0, 1);
+    bUser = Transpose(*a, 0, 1);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F)
+              && bUser.CheckData(answer, aUnitNum, 1e-4F);
+
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, aUnitNum);
+
+	/* call Transpose function */
+    _Transpose(aGPU, bGPU, 0, 1);
+    bUserGPU = Transpose(*aGPU, 0, 1);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F)
+              && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+    delete aGPU;
+    delete bGPU;
+	delete[] aDimSize;
+	delete[] bDimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete[] aDimSize;
+	delete[] bDimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/*
+case 2: test Transpose function.
+tensor transposition of dimensions i and j 
+*/
+bool TestTranspose2()
+{
+	/* a tensor of size (4, 3, 2) */
+	int aOrder = 3;
+	int * aDimSize = new int[aOrder];
+	aDimSize[0] = 4;
+	aDimSize[1] = 3;
+	aDimSize[2] = 2;
+
+	int aUnitNum = 1;
+	for (int i = 0; i < aOrder; i++)
+		aUnitNum *= aDimSize[i];
+
+    /* a tensor of size (2, 3, 4) */
+	int bOrder = 3;
+	int * bDimSize = new int[bOrder];
+	bDimSize[0] = 2;
+	bDimSize[1] = 3;
+	bDimSize[2] = 4;
+
+	int bUnitNum = 1;
+	for (int i = 0; i < bOrder; i++)
+		bUnitNum *= bDimSize[i];
+
+	DTYPE aData[4][3][2] = { { {1.0F, 2.0F}, 
+	                           {3.0F, 4.0F},
+	                           {5.0F, 6.0F} },
+                             { {2.0F, 4.0F}, 
+	                           {4.0F, 7.0F},
+	                           {6.0F, 8.0F} },
+                             { {1.0F, 2.0F}, 
+	                           {3.0F, 4.0F},
+	                           {5.0F, 6.0F} },
+                             { {2.0F, 4.0F}, 
+	                           {4.0F, 7.0F},
+	                           {6.0F, 8.0F} },};
+	DTYPE answer[2][3][4] = { { {1.0F, 2.0F, 1.0F, 2.0F},
+                                {2.0F, 4.0F, 2.0F, 4.0F},
+                                {3.0F, 4.0F, 3.0F, 4.0F} },
+                              { {4.0F, 7.0F, 4.0F, 7.0F},
+                                {5.0F, 6.0F, 5.0F, 6.0F},
+                                {6.0F, 8.0F, 6.0F, 8.0F} } };
+
+	/* CPU test */
+	bool cpuTest = true;
+
+	/* create tensors */
+	XTensor * a = NewTensor(aOrder, aDimSize);
+	XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor bUser;
+
+	/* initialize variables */
+	a->SetData(aData, aUnitNum);
+
+	/* call Transpose function */
+    _Transpose(a, b, 0, 2);
+    bUser = Transpose(*a, 0, 2);
+
+	/* check results */
+	cpuTest = b->CheckData(answer, aUnitNum, 1e-4F)
+              && bUser.CheckData(answer, aUnitNum, 1e-4F);
+
+#ifdef USE_CUDA
+	/* GPU test */
+	bool gpuTest = true;
+
+	/* create tensor */
+	XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor bUserGPU;
+
+	/* Initialize variables */
+	aGPU->SetData(aData, aUnitNum);
+
+	/* call Transpose function */
+    _Transpose(aGPU, bGPU, 0, 2);
+    bUserGPU = Transpose(*aGPU, 0, 2);
+
+	/* check results */
+	gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F)
+              && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
+
+	/* destroy variables */
+	delete a;
+	delete b;
+    delete aGPU;
+    delete bGPU;
+	delete[] aDimSize;
+	delete[] bDimSize;
+
+	return cpuTest && gpuTest;
+#else
+	/* destroy variables */
+	delete a;
+	delete b;
+	delete[] aDimSize;
+	delete[] bDimSize;
+
+	return cpuTest;
+#endif // USE_CUDA
+}
+
+/* other cases */
+/*
+TODO!!
+*/
+
+/* test for Transpose Function */
+bool TestTranspose()
+{
+	XPRINT(0, stdout, "[TEST TRANSPOSE] tensor transposition with specified dimensions \n");
+	bool returnFlag = true, caseFlag = true;
+
+	/* case 1 test */
+	caseFlag = TestTranspose1();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 1 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 1 passed!\n");
+    
+	/* case 2 test */
+	caseFlag = TestTranspose2();
+
+	if (!caseFlag) {
+		returnFlag = false;
+		XPRINT(0, stdout, ">> case 2 failed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> case 2 passed!\n");
+
+	/* other cases test */
+	/*
+	TODO!!
+	*/
+
+	if (returnFlag) {
+		XPRINT(0, stdout, ">> All Passed!\n");
+	}
+	else
+		XPRINT(0, stdout, ">> Failed!\n");
+
+	XPRINT(0, stdout, "\n");
+
+	return returnFlag;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/test/TTranspose.h
+++ b/source/tensor/test/TTranspose.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-30
+*/
+
+#ifndef __TEST_TRANSPOSE_H__
+#define __TEST_TRANSPOSE_H__
+
+#include "../core/shape/Transpose.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/* test for Transpose Function */
+bool TestTranspose();
+
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_TRANSPOSE_H__
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -30,38 +30,47 @@ bool Test()
    XPRINT(0, stdout, "Testing the XTensor utilites ... \n\n");

    /*wrong = !TestAbsolute() || wrong;
+    wrong = !TestClip() || wrong;
    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
+    wrong = !TestCos() || wrong;
    wrong = !TestConvertDataType() || wrong;
    wrong = !TestCopyIndexed() || wrong;
    wrong = !TestCopyValues() || wrong;
+    wrong = !TestDiv() || wrong;
+    wrong = !TestExp() || wrong;
    wrong = !TestLog() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;
    wrong = !TestMatrixMul2DParallel() || wrong;
    wrong = !TestMatrixMulBatched() || wrong;
-    wrong = !TestMatrixMulBatchedCPU() || wrong;
    wrong = !TestMerge() || wrong;
    wrong = !TestMultiply() || wrong;
    wrong = !TestNegate() || wrong;
    wrong = !TestNormalize() || wrong;
    wrong = !TestPower() || wrong;
    wrong = !TestReduceMax() || wrong;
-    wrong = !TestReduceMean() || wrong;
+    wrong = !TestReduceMean() || wrong;*/
    wrong = !TestReduceSum() || wrong;
-    wrong = !TestReduceSumSquared() || wrong;
+    /*wrong = !TestReduceSumSquared() || wrong;
    wrong = !TestReduceVariance() || wrong;
+    wrong = !TestRound() || wrong;
    wrong = !TestScaleAndShift() || wrong;
    wrong = !TestSelect() || wrong;
    wrong = !TestSetAscendingOrder() || wrong;
    wrong = !TestSetData() || wrong;
    wrong = !TestSign() || wrong;
+    wrong = !TestSin() || wrong;
    wrong = !TestSort() || wrong;
    wrong = !TestSplit() || wrong;
+    wrong = !TestSub() || wrong;
    wrong = !TestSum() || wrong;
    wrong = !TestSumByColumnTV() || wrong;
-    wrong = !TestSumByColumnVT() || wrong;*/
-    /*wrong = !TestTopK() || wrong;
+    wrong = !TestSumByColumnVT() || wrong;
+    wrong = !TestSumDim() || wrong;
+    wrong = !TestTan() || wrong;
+    wrong = !TestTranspose() || wrong;
+    //wrong = !TestTopK() || wrong;
    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;

@@ -70,8 +79,8 @@ bool Test()
    wrong = !TestLogSoftmax() || wrong;
    wrong = !TestLoss() || wrong;
    wrong = !TestRectify() || wrong;
-    wrong = !TestSigmoid() || wrong;*/
-    wrong = !TestSoftmax() || wrong;
+    wrong = !TestSigmoid() || wrong;
+    wrong = !TestSoftmax() || wrong;*/

    /* other test */
    /*

--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h
@@ -23,17 +23,20 @@
 #define __TEST_H__

 #include "TAbsolute.h"
+#include "TClip.h"
 #include "TConcatenate.h"
 #include "TConcatenateSolely.h"
+#include "TCos.h"
 #include "TConvertDataType.h"
 #include "TCopyIndexed.h"
 #include "TCopyValues.h"
+#include "TDiv.h"
+#include "TExp.h"
 #include "TLog.h"
 #include "TMatrixMul.h"
 #include "TMatrixMul2D.h"
 #include "TMatrixMul2DParallel.h"
 #include "TMatrixMulBatched.h"
-#include "TMatrixMULBatchedCPU.h"
 #include "TMerge.h"
 #include "TMultiply.h"
 #include "TNegate.h"
@@ -44,16 +47,22 @@
 #include "TReduceSum.h"
 #include "TReduceSumSquared.h"
 #include "TReduceVariance.h"
+#include "TRound.h"
 #include "TScaleAndShift.h"
 #include "TSelect.h"
 #include "TSetAscendingOrder.h"
 #include "TSetData.h"
 #include "TSign.h"
+#include "TSin.h"
 #include "TSort.h"
 #include "TSplit.h"
+#include "TSub.h"
 #include "TSum.h"
 #include "TSumByColumnTV.h"
 #include "TSumByColumnVT.h"
+#include "TSumDim.h"
+#include "TTan.h"
+#include "TTranspose.h"
 #include "TTopK.h"
 #include "TUnsqueeze.h"
 #include "TXMem.h"