Commit 7786f2b7 by 张裕浩

Test and merge.

parent 467c2ed7
...@@ -27,7 +27,6 @@
#include "../tensor/test/Test.h"
#include "../sample/fnnlm/FNNLM.h"
#include "../sample/transformer/Transformer.h"
//#include "../tensor/timer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
...@@ -36,37 +35,19 @@
void BackwardTest();
void TransposeTest();
void SumDimTest();
void SplitBackwardTest();
void MemTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
void test()
{
XTensor a;
InitTensor2D(&a, 100, 100, X_FLOAT, 0);
XTensor b;
InitTensor2D(&b, 100, 100, X_FLOAT16, 0);
_ConvertDataType(&a, &b);
return;
}
int main( int argc, const char ** argv )
{
-    //timer_c asd;
-    test();
-    //MemTest();
-    //return 0;
-    //SplitBackwardTest();
-    //return 0;
-    //_CrtSetBreakAlloc(896);
-    //BackwardTest();
-    //return 0;
-    /*if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
+    //_CrtSetBreakAlloc(2708);
+    if(argc > 1 && !strcmp(argv[1], "-test"))
+        Test();
+    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
...@@ -75,7 +56,8 @@ int main( int argc, const char ** argv )
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
-    }*/
+        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
+    }
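The dispatch above matches the help text: with no recognized option the program prints the usage hints, otherwise it runs the unit tests, the FNNLM sample, or the Transformer sample, forwarding the remaining arguments (argc - 1, argv + 1) unchanged. A usage sketch (the binary name "NiuTensor" is only an assumption; it depends on how the project is built):

    NiuTensor -test                          run the unit tests
    NiuTensor -fnnlm <fnnlm arguments>       run the sample FNN language model
    NiuTensor -t2t <transformer arguments>   run the sample Transformer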
//_CrtDumpMemoryLeaks();
...@@ -89,6 +71,9 @@ void BackwardTest()
XTensor a;
XTensor b;
XTensor c;
a.enableGrad = true;
b.enableGrad = false;
c.enableGrad = false;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
...@@ -106,14 +91,15 @@ void BackwardTest()
b.Set1D(2.0F, 0);
b.Set1D(1.0F, 1);
-    c = DivDim(a, b, 0);
+    DivDim(a, b, c, 0);
c.Dump(stderr, "c:");
auto loss = CrossEntropy(c, a);
//XLink::ShowNetwork(stderr, &c);
-    net.Backward(c);
+    net.Backward(loss);
-    net.Dump(stderr);
+    a.grad->Dump(stderr);
}
...@@ -229,67 +215,3 @@ void SumDimTest()
delete[] data;
}
void SplitBackwardTest()
{
int * dimSize = new int[2];
dimSize[0] = 2;
dimSize[1] = 4;
XTensor t1;
InitTensor2D(&t1, 2, 4, X_FLOAT, 0, NULL);
XTensor t2;
InitTensor2D(&t2, 2, 4, X_FLOAT, 0, NULL);
XTensor tensor;
//_SetDataFixedFloat(&t1, 1.0F);
//_SetDataFixedFloat(&t2, 2.0F);
t1.SetDataRand();
t2.SetDataRand();
tensor = t1 + t2;
XList smalls;
XTensor first;
XTensor second;
InitTensor2D(&first, 2, 2, X_FLOAT, 0, NULL);
InitTensor2D(&second, 2, 2, X_FLOAT, 0, NULL);
smalls.Add(&first);
smalls.Add(&second);
Split(tensor, smalls, 1, 2);
XTensor mul;
mul = Sum(first, second);
XNet net;
net.Backward(mul);
net.Dump(stderr);
printf("Done!");
}
void MemTest()
{
XMem * mem;
mem = new XMem(0, FREE_ON_THE_FLY, (MTYPE)MILLION, 1024, MILLION);
XTensor tensor;
InitTensor2D(&tensor, 2, 4, X_FLOAT, 0, mem);
tensor.SetZeroAll();
tensor.Dump(stderr);
delete mem;
if (tensor.mem != NULL) {
printf("It isn't null!\n");
printf("%d\n", (int)tensor.mem->signature);
}
else {
printf("It's null\n");
}
tensor.Dump(stderr);
}
\ No newline at end of file
...@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
-    _HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+    _HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
-    _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+    _IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
    int leadDim = income.GetParamInt(0);
    CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
    _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
-    _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+    _RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
-    _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+    _SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
    int leadDim = income.GetParamInt(0);
    CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
......
...@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
#include "XNoder.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
...@@ -31,6 +33,60 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
XTensor * output = income.tails[0];
XTensor * gold = NULL;
XTensor * weight = NULL;
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in cross entropy!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
}
node->visitMark = NODE_FINISHED;
}
/* indicates whether the node is for a loss computation */
bool XLossGrad::IsLossOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & LOSS_BASE) != 0;
}
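With this change the loss is an ordinary node in the computation graph: CrossEntropy() links a LOSS_CROSSENTROPY node to the network output, XNet::Backward() is called on that node, and XLossGrad::MakeGrad() seeds dE/dy before the rest of the graph is visited. A minimal usage sketch, not part of the commit; the tensor sizes and the gold tensor are made up for illustration, and the calls simply mirror BackwardTest() above:

    /* sketch: driving the new loss-based backward pass */
    XTensor x;
    XTensor gold;
    InitTensor2DV2(&x, 2, 3, X_FLOAT, -1);      /* -1 = CPU; any device id works */
    InitTensor2DV2(&gold, 2, 3, X_FLOAT, -1);
    x.enableGrad = true;                        /* keep the gradient for x */
    x.SetDataRand();
    gold.SetDataRand();

    XTensor y = Softmax(x, -1);                 /* forward pass */
    XTensor loss = CrossEntropy(y, gold);       /* appends the loss node to the graph */

    XNet net;
    net.Backward(loss);                         /* XLossGrad::MakeGrad seeds dE/dy */
    x.grad->Dump(stderr, "dE/dx:");             /* gradients now live on x.grad */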
/* /*
compute dE/dx for a given function y = f(x) compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss) >> gold - gold standard to measure error (or loss)
...@@ -42,39 +98,39 @@ compute dE/dx for a given function y = f(x) ...@@ -42,39 +98,39 @@ compute dE/dx for a given function y = f(x)
>> params - parameters of the function >> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy >> lossName - name of the loss, e.g., cross entropy
*/ */
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
//                        XTensor * dedy, XTensor * dedx, XTensor * padding,
//                        int funcID, void * params,
//                        LOSS_FUNCTION_NAME lossName)
//{
//    CheckNTErrors(gold && y && x, "Empty input tensors!");
//    CheckNTErrors(dedx, "Empty gradient tensors!");
//    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
//    if(funcID == FUNC_HARDTANH){
//        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
//    }
//    else if(funcID == FUNC_IDENTITY){
//        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
//    }
//    else if(funcID == FUNC_LOGSOFTMAX){
//        int leadDim = *(int*)params;
//        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
//    }
//    else if(funcID == FUNC_RECTIFY){
//        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
//    }
//    else if(funcID == FUNC_SIGMOID){
//        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
//    }else if(funcID == FUNC_SOFTMAX){
//        int leadDim = *(int*)params;
//        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
//    }
//    else{
//        ShowNTErrors("wrong function found when call the backward process!");
//    }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
...@@ -83,27 +139,27 @@ compute dE/dy for variable y and error(loss) function E
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
//                        XTensor * dedy, XTensor * padding,
//                        LOSS_FUNCTION_NAME lossName)
//{
//    if(gold == NULL){
//        if(dedy->dataType == X_FLOAT)
//            _SetDataFixedFloat(dedy, 1.0F);
//        else if(dedy->dataType == X_DOUBLE)
//            _SetDataFixedDouble(dedy, 1.0);
//        else if(dedy->dataType == X_INT)
//            _SetDataFixedInt(dedy, 1);
//        else{
//            ShowNTErrors("TODO");
//        }
//        return;
//    }
//
//    //_LossBackward(dedy, gold, y, lossName);
//    if(lossName == CROSSENTROPY)
//        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
...@@ -34,11 +35,19 @@ namespace nts{
class XLossGrad
{
public:
-    /* compute dE/dx for a given function y = f(x) */
-    void Compute(XTensor * gold, XTensor * y, XTensor * x,
-                 XTensor * dedy, XTensor * dedx, XTensor * padding,
-                 int funcID, void * params,
-                 LOSS_FUNCTION_NAME lossName);
+    /* compute dE/dx of a node */
+    static
+    void MakeGrad(XTensor * node, bool isEfficient);
+
+    /* indicates whether the node is for a loss computation */
+    static
+    bool IsLossOP(XTensor * node);
+
+    ///* compute dE/dx for a given function y = f(x) */
+    //void Compute(XTensor * gold, XTensor * y, XTensor * x,
+    //             XTensor * dedy, XTensor * dedx, XTensor * padding,
+    //             int funcID, void * params,
+    //             LOSS_FUNCTION_NAME lossName);

/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
......
...@@ -109,6 +109,11 @@ private: ...@@ -109,6 +109,11 @@ private:
static static
void GradMultiplyDim(XTensor * node, bool isEfficient); void GradMultiplyDim(XTensor * node, bool isEfficient);
/* gradient for multiply one dimension: c = a * b
where some dimensions of b are of size 1 */
static
void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
/* gradient for negate */ /* gradient for negate */
static static
void GradNegate(XTensor * node, bool isEfficient); void GradNegate(XTensor * node, bool isEfficient);
...@@ -125,14 +130,26 @@ private: ...@@ -125,14 +130,26 @@ private:
static static
void GradScaleAndShift(XTensor * node, bool isEfficient); void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Scale */
static
void GradScale(XTensor * node, bool isEfficient);
/* gradient for Shift */
static
void GradShift(XTensor * node, bool isEfficient);
/* gradient for Descale */
static
void GradDescale(XTensor * node, bool isEfficient);
/* gradient for Minus */ /* gradient for Minus */
static static
void GradSub(XTensor * node, bool isEfficient); void GradSub(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta /* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */ where the size of b is equal to that of one dimension of a */
static static
void GradSubDim(XTensor * node, bool isEfficient); void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sum: c = a + b * \beta */ /* gradient for sum: c = a + b * \beta */
static static
...@@ -143,6 +160,11 @@ private: ...@@ -143,6 +160,11 @@ private:
static static
void GradSumDim(XTensor * node, bool isEfficient); void GradSumDim(XTensor * node, bool isEfficient);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void GradSumBroadcast(XTensor * node, bool isEfficient);
/* gradient for reduceMean */ /* gradient for reduceMean */
static static
void GradReduceMean(XTensor * node, bool isEfficient); void GradReduceMean(XTensor * node, bool isEfficient);
...@@ -158,6 +180,10 @@ private: ...@@ -158,6 +180,10 @@ private:
/* gradient for reduceVariance */ /* gradient for reduceVariance */
static static
void GradReduceVariance(XTensor * node, bool isEfficient); void GradReduceVariance(XTensor * node, bool isEfficient);
/* gradient for operation */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
}; };
} }
......
...@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent) ...@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent); GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER) else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent); GradGather(node, isEfficent);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficent);
else if(operID == SHAPE_MERGE) else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent); GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST) else if(operID == SHAPE_MERGE_LIST)
...@@ -62,7 +64,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent) ...@@ -62,7 +64,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
} }
} }
/* indicates whether the node is for a shape operation */
bool XShapeGrad::IsShapeOP(XTensor * node) bool XShapeGrad::IsShapeOP(XTensor * node)
{ {
XLink &income = node->income; XLink &income = node->income;
...@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
    XTensor * input = income.tails[0];
    XTensor * index = income.tails[1];
...@@ -126,6 +128,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
    node->visitMark = NODE_FINISHED;
}
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
int order = node->grad->order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
input->grad->Reshape(order, dimSize);
node->visitMark = NODE_FINISHED;
}
/* /*
gradient for merge gradient for merge
for for
...@@ -232,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient) ...@@ -232,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!"); CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL; XTensor * last = NULL;
XList smalls(income.tailNum); TensorList smalls(income.tailNum);
XList smallsGrad(income.tailNum); TensorList smallsGrad(income.tailNum);
bool mergeOnly = true; bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){ for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i]; XTensor * tail = income.tails[i];
...@@ -242,7 +281,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient) ...@@ -242,7 +281,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
smallsGrad.Add(tail->grad); smallsGrad.Add(tail->grad);
if(i > 1){ if(i > 1){
CheckNTErrors(XTensor::IsSameShaped(last, tail), CheckNTErrors(_IsSameShaped(last, tail),
"Input tensors must be of the same size!"); "Input tensors must be of the same size!");
} }
...@@ -401,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient) ...@@ -401,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
/* we compute the gradient for current node, rather than for /* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */ child node, i.e., we use the outgoing edge here */
XLink &outgo = node->outgo; XLink &outgo = node->outgo;
XList splits(outgo.tailNum); TensorList splits(outgo.tailNum);
int whereToSplit = -1; int whereToSplit = -1;
int splitNum = 0; int splitNum = 0;
...@@ -411,7 +450,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient) ...@@ -411,7 +450,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
if(income.typeID == SHAPE_SPLIT_LIST){ if(income.typeID == SHAPE_SPLIT_LIST){
int w = income.GetParamInt(0); int w = income.GetParamInt(0);
int splitID = income.GetParamInt(1); int splitID = income.GetParamInt(1);
if(whereToSplit < 0) if(whereToSplit < 0)
whereToSplit = w; whereToSplit = w;
splitNum++; splitNum++;
......
...@@ -54,6 +54,10 @@ private: ...@@ -54,6 +54,10 @@ private:
static static
void GradGather(XTensor * node, bool isEfficent); void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */ /* gradient computation for merge: c = merge(a, b, ...) */
static static
void GradMerge(XTensor * node, bool isEfficent); void GradMerge(XTensor * node, bool isEfficent);
......
...@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
    nodes.Clear();
-    isGradEfficient = false;
+    isGradEfficient = true;
}

/* de-constructor */
...@@ -77,104 +77,20 @@ backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
*/
-void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XTensor &root)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);
-    XList golds(1);
-    golds.Add(NULL);
-    XList paddings(1);
-    paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
+    Backward(roots);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
XList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - specify a target value that is ignored and does not contribute to the gradient computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
XList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
XList paddings(roots.count);
for (int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
XList paddings(roots.count);
for (int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
} }
/*
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> paddings - specify a target value that is ignored
->> loss - name of loss function
*/
-void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots)
{
    Traverse(roots);
...@@ -187,39 +103,6 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
        node->visitMark = NODE_UNFINISHED;
    }
XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is no use in the
folloing process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
/* we compuate dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
/* back-propagation from output to input */ /* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){ for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i); XTensor * node = (XTensor*)nodes.Get(i);
...@@ -266,6 +149,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent) ...@@ -266,6 +149,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
XFuncGrad::MakeGrad(node, isEfficent); XFuncGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node)) else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node, isEfficent); XShapeGrad::MakeGrad(node, isEfficent);
else if(XLossGrad::IsLossOP(node))
XLossGrad::MakeGrad(node, isEfficent);
else{ else{
ShowNTErrors("Wrong node type!"); ShowNTErrors("Wrong node type!");
} }
...@@ -300,7 +185,7 @@ depth-first search (Tarjan's algorithm) ...@@ -300,7 +185,7 @@ depth-first search (Tarjan's algorithm)
*/ */
void XNet::Traverse(XTensor &root) void XNet::Traverse(XTensor &root)
{ {
XList roots(1); TensorList roots(1);
roots.Add(&root); roots.Add(&root);
Traverse(roots); Traverse(roots);
...@@ -311,7 +196,7 @@ traverse the net and find the topological order by ...@@ -311,7 +196,7 @@ traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) depth-first search (Tarjan's algorithm)
>> roots - a list of roots (or output nodes) >> roots - a list of roots (or output nodes)
*/ */
void XNet::Traverse(XList &roots) void XNet::Traverse(TensorList &roots)
{ {
id = MakeNetID(); id = MakeNetID();
nodes.Clear(); nodes.Clear();
...@@ -336,7 +221,7 @@ depth-first search given a node (Tarjan's algorithm for topological ordering) ...@@ -336,7 +221,7 @@ depth-first search given a node (Tarjan's algorithm for topological ordering)
>> orders - topological order of the nodes >> orders - topological order of the nodes
>> code - code of the network >> code - code of the network
*/ */
void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code) void XNet::TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code)
{ {
if(node == NULL) if(node == NULL)
return; return;
...@@ -444,7 +329,7 @@ show network topology ...@@ -444,7 +329,7 @@ show network topology
*/ */
void XNet::ShowNetwork(FILE * file, XTensor * node) void XNet::ShowNetwork(FILE * file, XTensor * node)
{ {
XList roots(1); TensorList roots(1);
roots.Add(node); roots.Add(node);
Traverse(roots); Traverse(roots);
...@@ -458,4 +343,14 @@ void XNet::ShowNetwork(FILE * file, XTensor * node) ...@@ -458,4 +343,14 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
} }
} }
-}
-\ No newline at end of file
+/*
+search for a node in a top-down manner by its name
+>> top - the top most node
+<< return - the node we found
+*/
+//XTensor * XNet::SearchNode(XTensor * top, const char * name)
+//{
+//    return XLink::SearchNode(top, name);
+//}
+}
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h" #include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h" #include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XNET_H__ #ifndef __XNET_H__
#define __XNET_H__ #define __XNET_H__
...@@ -36,16 +37,16 @@ struct XNet ...@@ -36,16 +37,16 @@ struct XNet
unsigned int id; unsigned int id;
/* tensor nodes of the network (in order) */ /* tensor nodes of the network (in order) */
XList nodes; TensorList nodes;
/* tensor nodes to keep gradient for output (e.g., SGD)*/ /* tensor nodes to keep gradient for output (e.g., SGD)*/
XList gradNodes; TensorList gradNodes;
/* output nodes of the network */ /* output nodes of the network */
XList outputs; TensorList outputs;
/* input nodes of the network */ /* input nodes of the network */
XList inputs; TensorList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */ /* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient; bool isGradEfficient;
...@@ -60,25 +61,11 @@ struct XNet ...@@ -60,25 +61,11 @@ struct XNet
void Clear(); void Clear();
/* backward propagation to obtain gradient */ /* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS); void Backward(XTensor &root);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function /* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */ with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS); void Backward(TensorList &roots);
/* backward computation for a given node */ /* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false); void BackwardNode(XTensor * node, bool isEfficent = false);
...@@ -92,10 +79,10 @@ struct XNet ...@@ -92,10 +79,10 @@ struct XNet
/* traverse the net and find the topological order by /* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */ depth-first search (Tarjan's algorithm) */
void Traverse(XList &roots); void Traverse(TensorList &roots);
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */ /* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code); void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
/* dump network information */ /* dump network information */
void Dump(FILE * file); void Dump(FILE * file);
...@@ -111,6 +98,10 @@ struct XNet ...@@ -111,6 +98,10 @@ struct XNet
/* show network topology */ /* show network topology */
void ShowNetwork(FILE * file, XTensor * node); void ShowNetwork(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
}; };
/* we make a unique id for every tensor */ /* we make a unique id for every tensor */
......
...@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node) ...@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
if(node == NULL) if(node == NULL)
return; return;
if(!XTensor::IsSameShaped(node, node->grad)){ if(!_IsSameShaped(node, node->grad)){
delete node->grad; delete node->grad;
node->grad = NewTensor(node); node->grad = NewTensor(node);
node->grad->SetZeroAll(); node->grad->SetZeroAll();
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/ */
#include "../tensor/XTensor.h" #include "../tensor/core/CHeader.h"
#ifndef __XNODER_H__ #ifndef __XNODER_H__
#define __XNODER_H__ #define __XNODER_H__
......
...@@ -51,14 +51,12 @@ initialize the model
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
->> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, char ** argv,
                             bool myIsMasked, int myIgnored,
-                            int myDevID, XMem * myMem)
+                            int myDevID)
{
    devID = myDevID;
-    mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
...@@ -71,20 +69,18 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);

-    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
-    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
+    InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
+    InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
+    InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
+    InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);

-    float scale = 1.0F;
-    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
-    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
-    float finfouta = (float)sqrt(6.0F * scale / (d + d));
-    wk.SetDataRand(-finfoutk, finfoutk);
-    wq.SetDataRand(-finfoutk, finfoutk);
-    wv.SetDataRand(-finfoutv, finfoutv);
-    wa.SetDataRand(-finfouta, finfouta);
+    float scale = 1.0F;
+    _SetDataFanInOut(&wk, scale);
+    _SetDataFanInOut(&wq, scale);
+    _SetDataFanInOut(&wv, scale);
+    _SetDataFanInOut(&wa, scale);
+    _SetDataFanInOut(&wbig, scale);
}
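The deleted per-matrix initialization drew each weight uniformly from a symmetric interval derived from the matrix shape; _SetDataFanInOut presumably applies the same fan-in/fan-out rule in one call (this reading is an assumption based on the deleted finfout* lines rather than on the function's implementation). As a formula:

    W_{ij} ~ U( -sqrt(6 * scale / (n_in + n_out)),  +sqrt(6 * scale / (n_in + n_out)) )

with n_in + n_out = d + dk for wq and wk, d + dv for wv, and 2d for wa, exactly the bounds the removed code computed by hand.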
/*
...@@ -103,40 +99,88 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bo
XTensor k2;
XTensor q2;
XTensor v2;

-    /* linear transofmration before self-attention */
+    /* linear transformation before self-attention */
    k2 = MMul(k, wk);
    q2 = MMul(q, wq);
    v2 = MMul(v, wv);

    return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the network given a big tensor that keeps keys, queries and values
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
XTensor kqv2;
TensorList split;
kqv2 = MMul(kqv, wbig);
int d1 = kqv2.GetDim(0);
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
Split(kqv2, split, 2, 3);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
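MakeBig above replaces the three separate projections with one fused projection: a single matrix wbig of shape d x 3d produces queries, keys and values in one MMul, and Split then slices the last dimension into three equal parts (the slices are collected in the order q, k, v, matching the order in which they are added to the split list). As an equation:

    [Q ; K ; V] = X * W_big,   W_big in R^{d x 3d},   Q, K, V each of size B x L x d

Functionally this should give the same result as Make(k, q, v, ...) when k = q = v = kqv, which is why the self-attention calls in the encoder and decoder switch to MakeBig.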
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;

/* multi head */
-    kheads = Split(k2, k2.order - 1, nhead);
-    qheads = Split(q2, q2.order - 1, nhead);
-    vheads = Split(v2, v2.order - 1, nhead);
+    kheads = Split(k, k.order - 1, nhead);
+    qheads = Split(q, q.order - 1, nhead);
+    vheads = Split(v, v.order - 1, nhead);

XTensor att;
XTensor dot;
XTensor scalar;

/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

if(isMasked)
    dot = dot + mask;

dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);

if(isTraining && dropoutP > 0)
    scalar = Dropout(scalar, dropoutP);

att = BMMul(scalar, vheads);

/* concatenate the heads */
return MMul(Merge(att, att.order - 1), wa);
}
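Written out, MakeAttention computes the standard scaled dot-product attention per head; note that dk here is the total key dimension, so dk/nhead is the per-head size used in the scaling factor:

    Attention(Q, K, V) = softmax( Q K^T / sqrt(dk / h) + M ) V

where h is the number of heads and M is the additive mask (applied only when isMasked is set). Dropout on the softmax output is used only at training time, and the per-head results are merged back and projected with wa.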
......
...@@ -42,9 +42,6 @@ public: ...@@ -42,9 +42,6 @@ public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* head number */ /* head number */
int nhead; int nhead;
...@@ -59,7 +56,9 @@ public: ...@@ -59,7 +56,9 @@ public:
/* transformation after dot-product attention */ /* transformation after dot-product attention */
XTensor wa; XTensor wa;
XTensor wbig;
/* size of transformed Q and K */ /* size of transformed Q and K */
int dk; int dk;
...@@ -92,10 +91,16 @@ public: ...@@ -92,10 +91,16 @@ public:
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored, bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL); int myDevID = -1);
/* make the network */ /* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining); XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
/* make the network given a big tensor that keeps keys, queries and values */
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
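A rough usage sketch of this loader, based only on the declarations above; the file name, vocabulary sizes and batch limits are made-up placeholders, and the exact meaning of the return value (presumably the number of sequences loaded) is an assumption:

    T2TBatchLoader loader;
    loader.Init(argc, argv);                      /* read buffer size, bucket size, etc. from the command line */

    FILE * file = fopen("train.data", "rb");      /* hypothetical preprocessed training file */
    XTensor batchEnc, paddingEnc, batchDec, paddingDec, gold, label;
    int ws = 0, wCount = 0;

    /* fill one MT batch: vsEnc/vsDec are source/target vocabulary sizes,
       sBatch/wBatch the sentence- and word-level batch limits */
    int sCount = loader.LoadBatch(file, false,
                                  &batchEnc, &paddingEnc, &batchDec, &paddingDec,
                                  &gold, &label, NULL,
                                  30000, 30000, 64, 4096,
                                  true, ws, wCount, 0, true);
    fclose(file);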
\ No newline at end of file
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include <math.h> #include <math.h>
#include "T2TDecoder.h" #include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h" #include "../../tensor/core/CHeader.h"
namespace transformer namespace transformer
...@@ -29,6 +31,10 @@ namespace transformer ...@@ -29,6 +31,10 @@ namespace transformer
/* constructor */ /* constructor */
AttDecoder::AttDecoder() AttDecoder::AttDecoder()
{ {
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
attentionsEnde = NULL; attentionsEnde = NULL;
attEndeLayerNorms = NULL; attEndeLayerNorms = NULL;
} }
...@@ -36,6 +42,10 @@ AttDecoder::AttDecoder() ...@@ -36,6 +42,10 @@ AttDecoder::AttDecoder()
/* de-constructor */ /* de-constructor */
AttDecoder::~AttDecoder() AttDecoder::~AttDecoder()
{ {
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
delete[] attentionsEnde; delete[] attentionsEnde;
delete[] attEndeLayerNorms; delete[] attEndeLayerNorms;
} }
...@@ -47,21 +57,43 @@ initialize the model ...@@ -47,21 +57,43 @@ initialize the model
>> myIsMasked - indicates whether the masked attention is employed >> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start) >> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id >> myDevID - device id
>> myMem - the memory pool
*/ */
void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
-                          int myDevID, XMem * myMem)
+                          int myDevID)
{
-    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];

/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
-    attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-    attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+    attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+    fnns[i].InitModel(argc, argv, myDevID);
+    attLayerNorms[i].InitModel(argc, argv, myDevID);
+    fnnLayerNorms[i].InitModel(argc, argv, myDevID);
+    attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
+    attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
}
}
...@@ -93,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
-    att = attentions[i].Make(x, x, x, mask, isTraining);
+    att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
...@@ -133,6 +165,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X ...@@ -133,6 +165,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* layer normalization */ /* layer normalization */
x = fnnLayerNorms[i].Make(res); x = fnnLayerNorms[i].Make(res);
} }
x.SetName(DECODING_NAME);
return x; return x;
} }
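Each decoder layer therefore follows the same post-norm residual pattern as the encoder: sublayer, dropout, residual add, then layer normalization. The residual additions themselves sit in the elided context lines, so the exact expressions below are inferred rather than quoted from the diff:

    x <- LN( x + SelfAtt(x) )
    x <- LN( x + EncDecAtt(x, outputEnc) )
    x <- LN( x + FNN(x) )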
......
...@@ -26,10 +26,57 @@ ...@@ -26,10 +26,57 @@
namespace transformer namespace transformer
{ {
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder : public AttEncoder class AttDecoder
{ {
public: public:
/* device id */
int devID;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for fnn */
T2TLN * fnnLayerNorms;
/* layer normalization for attention */
T2TLN * attLayerNorms;
/* input tensor of the decoder */
XTensor * input;
/* output tensor of the encoder */
XTensor * output;
/* encoder-decoder attention model of each layer */ /* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde; T2TAttention * attentionsEnde;
...@@ -45,7 +92,7 @@ public: ...@@ -45,7 +92,7 @@ public:
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored, bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL); int myDevID = -1);
/* make the decoding network */ /* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining); XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......
...@@ -31,7 +31,6 @@ namespace transformer ...@@ -31,7 +31,6 @@ namespace transformer
T2TEmbedder::T2TEmbedder() T2TEmbedder::T2TEmbedder()
{ {
devID = -1; devID = -1;
mem = NULL;
vSize = -1; vSize = -1;
maxLength = -1; maxLength = -1;
} }
...@@ -46,19 +45,23 @@ initialize the model ...@@ -46,19 +45,23 @@ initialize the model
>> argc - number of arguments >> argc - number of arguments
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> myDevID - device id >> myDevID - device id
>> myMem - the memory pool
*/ */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
    devID = myDevID;
-    mem = myMem;
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512); LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE); LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE); LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem); InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
DTYPE v = 1.0F/(float)sqrt((float)eSize); DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v); w.SetDataRandn(0, v);
...@@ -75,7 +78,7 @@ make positional embeddings (of size eSize * length) ...@@ -75,7 +78,7 @@ make positional embeddings (of size eSize * length)
*/ */
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length) void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{ {
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem); InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum]; float * data = new float[posEmbeddingBase.unitNum];
...@@ -139,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input) ...@@ -139,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */ /* we make positional embeddings first */
//if(!match){ //if(!match){
if(true){ if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem); InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem); XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0); _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]); _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
......
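For reference, MakePosEmbedding fills posEmbeddingBase with one row per position; assuming it follows the usual Transformer sinusoidal scheme (the loop that writes the data array lies outside the shown context, so this is an assumption rather than a quote of the code):

    PE(pos, 2i)   = sin( pos / 10000^{2i/d} )
    PE(pos, 2i+1) = cos( pos / 10000^{2i/d} )

The resulting table is broadcast along dimension 0 by _Unsqueeze and then combined with the word embeddings.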
...@@ -41,9 +41,6 @@ public: ...@@ -41,9 +41,6 @@ public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */ /* vocabulary size */
int vSize; int vSize;
...@@ -71,7 +68,7 @@ public: ...@@ -71,7 +68,7 @@ public:
~T2TEmbedder(); ~T2TEmbedder();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */ /* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length); void MakePosEmbedding(int eSize, int d, int length);
......
...@@ -52,15 +52,12 @@ initialize the model
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
->> myMem - the memory pool
-*/
+>> myDevID - device id*/
void AttEncoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored,
-                          int myDevID, XMem * myMem)
+                          int myDevID)
{
    devID = myDevID;
-    mem = myMem;
ignored = myIgnored; ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6); LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
...@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv, ...@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\""); CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */ /* embedding model */
embedder.InitModel(argc, argv, devID, mem); embedder.InitModel(argc, argv, devID);
attentions = new T2TAttention[nlayer]; attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer]; fnns = new T2TFNN[nlayer];
...@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv, ...@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */ /* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){ for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem); attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID, myMem); fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem); attLayerNorms[i].InitModel(argc, argv, myDevID);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem); fnnLayerNorms[i].InitModel(argc, argv, myDevID);
} }
} }
...@@ -103,8 +100,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -103,8 +100,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
x = embedder.Make(input); x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP); x = Dropout(x, dropoutP);
...@@ -116,8 +111,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -116,8 +111,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res; XTensor res;
/* self attention */ /* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining); att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */ /* dropout */
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP); att = Dropout(att, dropoutP);
...@@ -141,6 +136,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo ...@@ -141,6 +136,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* layer normalization */ /* layer normalization */
x = fnnLayerNorms[i].Make(res); x = fnnLayerNorms[i].Make(res);
} }
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
return x; return x;
} }
......
...@@ -32,6 +32,9 @@ using namespace nts; ...@@ -32,6 +32,9 @@ using namespace nts;
namespace transformer namespace transformer
{ {
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/* /*
base class of the encoder base class of the encoder
...@@ -62,9 +65,6 @@ public: ...@@ -62,9 +65,6 @@ public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* layer number */ /* layer number */
int nlayer; int nlayer;
...@@ -115,7 +115,7 @@ public: ...@@ -115,7 +115,7 @@ public:
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored, bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL); int myDevID = -1);
/* make the encoding network */ /* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining); XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......
...@@ -47,12 +47,10 @@ initialize the model ...@@ -47,12 +47,10 @@ initialize the model
>> argc - number of arguments >> argc - number of arguments
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> myDevID - device id >> myDevID - device id
>> myMem - the memory pool
*/ */
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem) void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
{ {
devID = myDevID; devID = myDevID;
mem = myMem;
float minmax = 0; float minmax = 0;
...@@ -62,19 +60,17 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem) ...@@ -62,19 +60,17 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F); LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0); LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem); InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem); InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem); InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem); InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
float scale = 1.0F; float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize)); _SetDataFanInOut(&w1, scale);
float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize)); _SetDataFanInOut(&w2, scale);
w1.SetDataRand(-finfout1, finfout1);
b1.SetZeroAll(); b1.SetZeroAll();
w2.SetDataRand(-finfout2, finfout2);
b2.SetZeroAll(); b2.SetZeroAll();
} }
...@@ -89,13 +85,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining) ...@@ -89,13 +85,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor t1; XTensor t1;
/* t1 = max(0, x * w1 + b1) */ /* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, w1) + b1); //t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0) if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP); t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */ /* result = t1 * w2 + b2 */
return MMul(t1, w2) + b2; //return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2);
} }
......
...@@ -36,9 +36,6 @@ public: ...@@ -36,9 +36,6 @@ public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* size of input vector */ /* size of input vector */
int inSize; int inSize;
...@@ -72,7 +69,7 @@ public: ...@@ -72,7 +69,7 @@ public:
~T2TFNN(); ~T2TFNN();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */ /* make the network */
XTensor Make(XTensor &input, bool isTraining); XTensor Make(XTensor &input, bool isTraining);
......
...@@ -32,7 +32,6 @@ namespace transformer ...@@ -32,7 +32,6 @@ namespace transformer
T2TLN::T2TLN() T2TLN::T2TLN()
{ {
devID = -1; devID = -1;
mem = NULL;
d = 0; d = 0;
} }
...@@ -46,18 +45,16 @@ initialize the model ...@@ -46,18 +45,16 @@ initialize the model
>> argc - number of arguments >> argc - number of arguments
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> myDevID - device id >> myDevID - device id
>> myMem - the memory pool
*/ */
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem) void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{ {
devID = myDevID; devID = myDevID;
mem = myMem;
d = 0; d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE); LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID, mem); InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID, mem); InitTensor1DV2(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F); w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll(); b.SetZeroAll();
......
...@@ -36,9 +36,6 @@ class T2TLN ...@@ -36,9 +36,6 @@ class T2TLN
public: public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* the transformation matrix w */ /* the transformation matrix w */
XTensor w; XTensor w;
...@@ -57,7 +54,7 @@ public: ...@@ -57,7 +54,7 @@ public:
~T2TLN(); ~T2TLN();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */ /* make the network */
XTensor Make(XTensor &input); XTensor Make(XTensor &input);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
}
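As a quick check of the formula above (an illustrative sketch, not part of this commit; the helper name is hypothetical), the scalar form of the penalty can be computed directly; for example, with alpha = 0.6 a sequence of length 10 gets lp = ((5 + 10)/6)^0.6 ≈ 1.73:
#include <math.h>
/* illustrative only: scalar form of the GNMT-like length penalty above */
float GNMTPenaltyScalar(int n, float alpha)
{
    return powf((5.0F + (float)n) / (5.0F + 1.0F), alpha);
}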
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
 * Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they get a higher score
from the product of a sequence of probability-like terms and thus have more
chances to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
XTensor GNMT(const XTensor & length, float alpha);
};
}
#endif
...@@ -31,15 +31,15 @@ ...@@ -31,15 +31,15 @@
namespace transformer namespace transformer
{ {
/* a transformer model that keeps the parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in the transformer. */
class T2TModel class T2TModel
{ {
public: public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* the encoder */ /* the encoder */
AttEncoder * encoder; AttEncoder * encoder;
...@@ -78,10 +78,24 @@ public: ...@@ -78,10 +78,24 @@ public:
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining); void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */ /* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining); void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */ /* get parameter matrices */
void GetParams(XList &list); void GetParams(TensorList &list);
/* dump the parameters */ /* dump the parameters */
void Dump(const char * fn); void Dump(const char * fn);
......
...@@ -31,7 +31,6 @@ namespace transformer ...@@ -31,7 +31,6 @@ namespace transformer
T2TOutput::T2TOutput() T2TOutput::T2TOutput()
{ {
devID = -1; devID = -1;
mem = NULL;
vSize = -1; vSize = -1;
inSize = -1; inSize = -1;
hSize = -1; hSize = -1;
...@@ -47,21 +46,19 @@ initialize the model ...@@ -47,21 +46,19 @@ initialize the model
>> argc - number of arguments >> argc - number of arguments
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> myDevID - device id >> myDevID - device id
>> myMem - the memory pool
*/ */
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem) void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{ {
devID = myDevID; devID = myDevID;
mem = myMem;
float minmax = 0; float minmax = 0;
LoadParamInt(argc, argv, "vsize", &vSize, -1); LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE); LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE); LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F); LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem); InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
float scale = 1.0F; float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize)); float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
...@@ -93,8 +90,9 @@ void T2TOutput::Make(XTensor &input, XTensor &output) ...@@ -93,8 +90,9 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{ {
XTensor &x = input; XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1); //output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1); output = Softmax(MMul(x, w), -1);
output.SetName(OUTPUT_NAME);
} }
} }
...@@ -28,6 +28,8 @@ using namespace nts; ...@@ -28,6 +28,8 @@ using namespace nts;
namespace transformer namespace transformer
{ {
#define OUTPUT_NAME "output"
/* output layer */ /* output layer */
class T2TOutput class T2TOutput
...@@ -36,9 +38,6 @@ public: ...@@ -36,9 +38,6 @@ public:
/* device id */ /* device id */
int devID; int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */ /* vocabulary size */
int vSize; int vSize;
...@@ -59,7 +58,7 @@ public: ...@@ -59,7 +58,7 @@ public:
~T2TOutput(); ~T2TOutput();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL); void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */ /* make the network */
XTensor Make(XTensor &input); XTensor Make(XTensor &input);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
{
state->layersEnc.Clear();
state->layersDec.Clear();
XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
CheckNTErrors(encoding != NULL, "No encoding layers found!");
state->layersEnc.Add(encoding);
state->layersDec.Add(NULL);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc)
{
int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
inputDec = Identity(first);
}
else{
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor decoding;
XTensor decodingStep;
for(int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1DV2(&selectSrc, 1, X_INT);
InitTensor1DV2(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID);
selectTgt.SetDevice(decoding.devID);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */
m->outputLayer->Make(decodingStep, output);
next->layersEnc.AddList(&s->layersEnc);
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), the prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in a batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we have gone over so far */
int nstep;
/* pointer to the previous state */
T2TState * last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states */
T2TState * states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure as MT inference -
we take the state of the previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
/* current state */
T2TStateBundle * s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
};
}
#endif
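A minimal usage sketch of the predictor interface declared above (illustrative only; the real driver lives in T2TSearch, and "model", "encoderOutput", "inputEnc", "paddingEnc", "beamSize" and "startSymbol" are placeholders):
T2TPredictor predictor;
T2TStateBundle cur;
T2TStateBundle next;
/* build the initial state from the encoder output and set the start symbol (e.g. the <s> token id) */
predictor.Create(model, encoderOutput, inputEnc, beamSize, &cur);
predictor.SetStartSymbol(startSymbol);
/* one decoding step: read the current hypotheses, then predict distributions
   for the next position; beam pruning is done by the caller (T2TSearch) */
predictor.Read(model, &cur);
predictor.Predict(&next, encoderOutput, inputEnc, paddingEnc);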
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
/* array of the end symbols */
int * endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
}
#endif
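For orientation, a heavily simplified sketch of how the per-step routines declared above could be chained (illustrative only, not the actual implementation in T2TSearch.cpp, which also initializes the first state bundle and drives the predictor to fill each beam's prediction distribution):
/* illustrative sketch only */
void SearchLoopSketch(T2TSearch &search, XTensor * output,
                      int batchSize, int beamSize, int maxLength)
{
    T2TStateBundle * states = new T2TStateBundle[maxLength + 1];
    search.Prepare(batchSize, beamSize);
    for (int step = 0; step < maxLength; step++) {
        T2TStateBundle * prev = states + step;
        T2TStateBundle * beam = states + step + 1;
        /* in the real code the predictor fills beam->prob before scoring */
        search.Score(prev, beam);   /* model score = path score (+ length penalty) */
        search.Generate(beam);      /* beam pruning over the predictions */
        search.Expand(prev, beam);  /* extend the back-pointers of the search graph */
        search.Collect(beam);       /* move finished hypotheses into the heap */
    }
    search.FillHeap(states + maxLength); /* keep incomplete hypotheses if nothing finished */
    search.Dump(output);                 /* write the best paths into the output tensor */
    delete[] states;
}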
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, exp(loss/wordCount));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "T2TBatchLoader.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* for batching */
T2TBatchLoader batchLoader;
/* decoder for inference */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
};
}
#endif
\ No newline at end of file
...@@ -23,11 +23,9 @@ ...@@ -23,11 +23,9 @@
#define __T2TTRAINER_H__ #define __T2TTRAINER_H__
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h" #include "../../tensor/function/FHeader.h"
#define MAX_SEQUENCE_LENGTH 1024 * 4
using namespace nts; using namespace nts;
namespace transformer namespace transformer
...@@ -42,33 +40,6 @@ public: ...@@ -42,33 +40,6 @@ public:
/* parameter array */ /* parameter array */
char ** argArray; char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* buffer size */
int bufSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */ /* dimension size of each inner layer */
int d; int d;
...@@ -111,10 +82,10 @@ public: ...@@ -111,10 +82,10 @@ public:
float adamBeta2T; float adamBeta2T;
/* list of the moment of the parameter matrices */ /* list of the moment of the parameter matrices */
XList moments; TensorList moments;
/* list of the 2nd order moment of the parameter matrices */ /* list of the 2nd order moment of the parameter matrices */
XList moments2nd; TensorList moments2nd;
/* indicates whether the data file is shuffled for training */ /* indicates whether the data file is shuffled for training */
bool isShuffled; bool isShuffled;
...@@ -130,20 +101,15 @@ public: ...@@ -130,20 +101,15 @@ public:
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */ /* indicates whether we intend to debug the net */
bool isBigBatch; bool isDebugged;
/* indicates whether we use small memory footprint for backward process */ /* indicates whether the sequence is sorted by length */
bool isSmallFootprint; bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
public: public:
/* constructor */ /* constructor */
...@@ -163,46 +129,6 @@ public: ...@@ -163,46 +129,6 @@ public:
/* make a checkpoint */ /* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id); void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */ /* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs); float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
......
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
#include "T2TModel.h" #include "T2TModel.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TTrainer.h" #include "T2TTrainer.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "../../tensor/XDevice.h" #include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h" #include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h" #include "../../tensor/XGlobal.h"
...@@ -36,8 +38,6 @@ int TransformerMain(int argc, const char ** argv) ...@@ -36,8 +38,6 @@ int TransformerMain(int argc, const char ** argv)
{ {
if(argc == 0) if(argc == 0)
return 1; return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc]; char ** args = new char*[argc];
for(int i = 0; i < argc; i++){ for(int i = 0; i < argc; i++){
...@@ -49,6 +49,7 @@ int TransformerMain(int argc, const char ** argv) ...@@ -49,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, args); ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH]; char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH]; char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH]; char * testFN = new char[MAX_LINE_LENGTH];
...@@ -58,8 +59,10 @@ int TransformerMain(int argc, const char ** argv) ...@@ -58,8 +59,10 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "model", modelFN, ""); LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, ""); LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, ""); LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argc, args); trainer.Init(argc, args);
...@@ -78,12 +81,22 @@ int TransformerMain(int argc, const char ** argv) ...@@ -78,12 +81,22 @@ int TransformerMain(int argc, const char ** argv)
if(strcmp(modelFN, "")) if(strcmp(modelFN, ""))
model.Read(modelFN); model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
/* test the model on the new data */ /* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, "")) if(strcmp(testFN, "") && strcmp(outputFN, "")){
tester.Test(testFN, outputFN, &model); /* beam search */
if(isBeamSearch){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
}
/* forced decoding */
else{
T2TTrainer tester;
tester.Init(argc, args);
tester.Test(testFN, outputFN, &model);
}
}
delete[] trainFN; delete[] trainFN;
delete[] modelFN; delete[] modelFN;
......
...@@ -30,7 +30,9 @@ ...@@ -30,7 +30,9 @@
#include "XDevice.h" #include "XDevice.h"
#include "./test/Test.h" #include "./test/Test.h"
#include "./core/CHeader.h" #include "./core/CHeader.h"
#include "./XBLAS.h"
#include "./core/sort/TopK.h"
#include "./core/movement/Gather.h"
//#define CRTDBG_MAP_ALLOC //#define CRTDBG_MAP_ALLOC
//#include <stdlib.h> //#include <stdlib.h>
//#include <crtdbg.h> //#include <crtdbg.h>
...@@ -39,9 +41,6 @@ using namespace nts; ...@@ -39,9 +41,6 @@ using namespace nts;
void SmallTest(); void SmallTest();
void TransposeTest(); void TransposeTest();
void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest(); void PowerTest();
int main( int argc, const char ** argv ) int main( int argc, const char ** argv )
...@@ -166,127 +165,5 @@ void TransposeTest() ...@@ -166,127 +165,5 @@ void TransposeTest()
delete[] data; delete[] data;
} }
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
}
void T2TTest()
{
XTensor * input;
XTensor * weight;
XTensor * output;
XTensor * gold;
XTensor * dedy;
XTensor * dedx;
XTensor * dedxTmp;
XTensor * dedw;
XTensor * padding;
DTYPE loss;
int * dimSize = new int[2];
dimSize[0] = 256;
dimSize[1] = 10001;
int * dimSize2 = new int[3];
dimSize2[0] = 2;
dimSize2[1] = 31;
dimSize2[2] = 256;
int * dimSize3 = new int[3];
dimSize3[0] = 2;
dimSize3[1] = 31;
dimSize3[2] = 10001;
int * dimSize4 = new int[2];
dimSize4[0] = 2;
dimSize4[1] = 31;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead(input, "x.txt", "x");
myRead(weight, "w.txt", "w");
myRead(gold, "gold.txt", "gold");
myRead(padding, "padding.txt", "padding");
XTensor inter;
inter = MMul(*input, *weight);
_Softmax(&inter, output, 2);
//_LogMe(output);
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
printf("loss: %f\n", loss);
_CrossEntropyBackward(dedy, output, gold, NULL);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump(dedy, "dedy.txt", "dedy");
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
_Sub(output, gold, dedxTmp);
myDump(dedx, "dedx.txt", "dedx");
dedx->Dump(stderr, "dedx", 200);
dedxTmp->Dump(stderr, "dedxTmp", 200);
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
int dimSize[3];
dimSize[0] = 161;
dimSize[1] = 47;
dimSize[2] = 10001;
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
DTYPE sum = _ReduceSumAll(probs);
printf("%e\n", sum);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
...@@ -28,7 +28,6 @@ ...@@ -28,7 +28,6 @@
#ifndef __XBLAS_H__ #ifndef __XBLAS_H__
#define __XBLAS_H__ #define __XBLAS_H__
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts{ namespace nts{
...@@ -36,7 +35,7 @@ namespace nts{ ...@@ -36,7 +35,7 @@ namespace nts{
#define OPENBLAS_CONST const #define OPENBLAS_CONST const
typedef int BLASINT; typedef int BLASINT;
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-21
*/
#ifndef __XCALL_H__
#define __XCALL_H__
#include "XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* we define the "new and delete" functions below
*/
/* initialize a XTensor */
void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate a XTensor with no initialization */
XTensor * NewTensor();
/* generate a XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, const bool isEnableGrad = true);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
/* free the data space of a given tensor (on the buffer) */
void DelTensorBuf(XTensor * tensor);
} // namespace nts(NiuTrans.Tensor)
#endif // __XCALL_H__
\ No newline at end of file
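A short usage sketch of the V2 interfaces declared above (illustrative only; devID = -1 selects the CPU):
/* illustrative only: create, use and free dense tensors with the V2 interface */
XTensor a;
InitTensor2DV2(&a, 2, 3, X_FLOAT, -1);    /* a 2 x 3 float matrix on the CPU */
a.SetZeroAll();
XTensor * b = NewTensor2DV2(2, 3, X_FLOAT, -1);
b->SetDataRand(-0.1F, 0.1F);              /* uniform values in [-0.1, 0.1] */
DelTensor(b);                             /* tensors from NewTensor* are freed with DelTensor */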
...@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName) ...@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
} }
} }
/**************************************************** /*
Below is for calling CPU BLAS for fast matrix operations Below is for calling CPU BLAS for fast matrix operations
I'm not sure how fast it is. But it seems that other I'm not sure how fast it is. But it seems that other
guys are crazy about this. So I decided to have a try. guys are crazy about this. So I decided to have a try.
...@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h) ...@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
return f; return f;
} }
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
...@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName); ...@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */ /* data conversion (for lower precision computation) */
unsigned short FloatToFloat16(float f); unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h); float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <time.h>
#include "XDevice.h" #include "XDevice.h"
#include "XGlobal.h" #include "XGlobal.h"
#include "XThread.h" #include "XThread.h"
...@@ -59,6 +60,7 @@ XDevice::~XDevice() ...@@ -59,6 +60,7 @@ XDevice::~XDevice()
cublasDestroy(cublasHandle); cublasDestroy(cublasHandle);
if(stream != NULL) if(stream != NULL)
delete stream; delete stream;
curandDestroyGenerator(gen);
#endif #endif
} }
...@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID) ...@@ -68,6 +70,7 @@ void XDevice::Init(int myDevID)
Clear(); Clear();
devID = myDevID; devID = myDevID;
seed = rand();
/* CPU information */ /* CPU information */
if(devID < 0){ if(devID < 0){
...@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID) ...@@ -80,6 +83,10 @@ void XDevice::Init(int myDevID)
cudaDeviceProp prop; cudaDeviceProp prop;
cudaSetDevice(myDevID); cudaSetDevice(myDevID);
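/* create a per-device cuRAND generator and seed it from the host-side RNG (seed = rand() above) */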
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, seed);
if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){ if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID); XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
exit(1); exit(1);
...@@ -194,7 +201,8 @@ void XDevice::SetGPUDevice(int devID) ...@@ -194,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
cudaError_t error = cudaSetDevice(devID); cudaError_t error = cudaSetDevice(devID);
if (error != cudaSuccess){ if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error)); fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1); exit(1);
} }
#else #else
...@@ -209,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID) ...@@ -209,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
SetFastFlags(); SetFastFlags();
} }
/* switch to a get current dev */ /* get the id of the current GPU device */
int XDevice::GetGPUDevice() int XDevice::GetGPUDevice()
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -217,7 +225,8 @@ int XDevice::GetGPUDevice() ...@@ -217,7 +225,8 @@ int XDevice::GetGPUDevice()
cudaError_t error = cudaGetDevice(&devID); cudaError_t error = cudaGetDevice(&devID);
if (error != cudaSuccess){ if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error)); fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1); exit(1);
} }
...@@ -241,7 +250,7 @@ void XDevice::SetFastFlags() ...@@ -241,7 +250,7 @@ void XDevice::SetFastFlags()
#endif #endif
} }
/* reset cuda flag for more efficient cuda execution (all devices) */ /* reset the cuda flag for more efficient cuda execution (all devices) */
void XDevice::SetFastFlagsAllDevices() void XDevice::SetFastFlagsAllDevices()
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -267,9 +276,11 @@ XDevManager::~XDevManager() ...@@ -267,9 +276,11 @@ XDevManager::~XDevManager()
} }
/* initialize it and get the CPU and GPU information */ /* initialization */
void XDevManager::Init() void XDevManager::Init()
{ {
srand((unsigned int)time(NULL));
Clear(); Clear();
/* CPUs (we actually do not care about how many CPUs are using) */ /* CPUs (we actually do not care about how many CPUs are using) */
...@@ -309,7 +320,7 @@ void XDevManager::Clear() ...@@ -309,7 +320,7 @@ void XDevManager::Clear()
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the handle of GPU */ /* get the handle of a given GPU */
cublasHandle_t * XDevManager::GetCudaHandle(const int devID) cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
{ {
CheckNTErrors(devID < nGPU, "index of GPU is out of range."); CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
...@@ -317,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID) ...@@ -317,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle(); return GPUs[devID].GetCublasHandle();
} }
/* get the stream of cuda */ /* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID) cudaStream_t * XDevManager::GetCudaStream(const int devID)
{ {
CheckNTErrors(devID < nGPU, "index of GPU is out of range."); CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
...@@ -465,7 +476,7 @@ split a string ...@@ -465,7 +476,7 @@ split a string
>> items - splitting result >> items - splitting result
<< return - how many items are there << return - how many items are there
*/ */
int SplitALine(char * inputString, const char * seperator, XList * items) int SplitALine(char * inputString, const char * seperator, StrList* items)
{ {
items->Clear(); items->Clear();
...@@ -514,12 +525,12 @@ get device ids for the given device information ...@@ -514,12 +525,12 @@ get device ids for the given device information
devInfo = "0:CPU-1 1:GPU-0 2:CPU-1" devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
means that the first device is CPU, the second device means that the first device is CPU, the second device
is GPU-0, the third device is CPU. is GPU-0, the third device is CPU.
>> devIDs - device sequence specified by devInfo >> devIDs - device IDs specified by devInfo
<< return - number of devices << return - number of devices
*/ */
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs) int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{ {
XList * terms = new XList(1); StrList* terms = new StrList(1);
SplitALine(devInfo, " ", terms); SplitALine(devInfo, " ", terms);
for(int i = 0; i < terms->count; i++){ for(int i = 0; i < terms->count; i++){
...@@ -556,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs) ...@@ -556,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
return devCount; return devCount;
} }
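A hedged usage sketch for the device-string parsing above; the buffer size and the printed output are illustrative assumptions, only GDevs.GetDeviceIDs and the "id:TYPE" string convention come from this file.

/* a minimal sketch, assuming GDevs (the global XDevManager) has been initialized;
   devInfo follows the "0:CPU-1 1:GPU-0 2:CPU-1" convention documented above */
char devInfo[] = "0:CPU-1 1:GPU-0 2:CPU-1";
int devIDs[32];
int devNum = GDevs.GetDeviceIDs(devInfo, devIDs);
for (int i = 0; i < devNum; i++)
    fprintf(stderr, "device %d -> id %d\n", i, devIDs[i]);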
/* show id sequence */ /* show device IDs */
void XDevManager::ShowDeviceIDs(char * devInfo, char * msg) void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
{ {
msg[0] = 0; msg[0] = 0;
......
...@@ -99,6 +99,9 @@ public: ...@@ -99,6 +99,9 @@ public:
/* default stream for the device */ /* default stream for the device */
XStream * stream; XStream * stream;
/* seed for random number generation */
int seed;
#ifdef USE_CUDA #ifdef USE_CUDA
/* mutex for handle (GPU cublas) */ /* mutex for handle (GPU cublas) */
...@@ -109,6 +112,9 @@ public: ...@@ -109,6 +112,9 @@ public:
/* specify if the handle is initialized */ /* specify if the handle is initialized */
bool isHandleReady; bool isHandleReady;
/* generator of random numbers */
curandGenerator_t gen;
#endif #endif
...@@ -230,6 +236,18 @@ extern XDevManager GDevs; ...@@ -230,6 +236,18 @@ extern XDevManager GDevs;
cudaSetDevice(devIDBackup); \ cudaSetDevice(devIDBackup); \
} \ } \
#define CheckDev(a, b) \
{ \
if((a < 0 && b >= 0) || (a >= 0 && b < 0)){ \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
exit(1); \
} \
else if (a >= 0 && b >= 0 && a != b) { \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
exit(1); \
} \
} \
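A hedged sketch of how the new CheckDev macro might be used; the wrapper function below is an assumption for illustration, and it assumes XTensor exposes its device id as devID.

/* a minimal sketch: a negative devID means CPU, a non-negative one names a GPU,
   so CheckDev aborts when the two ids refer to different devices */
void CheckSameDevice(const XTensor &a, const XTensor &b)
{
    CheckDev(a.devID, b.devID);
}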
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif
...@@ -49,7 +49,7 @@ namespace nts { ...@@ -49,7 +49,7 @@ namespace nts {
#ifdef DOUBELPRICSION #ifdef DOUBELPRICSION
#define DTYPE double #define DTYPE double
#define DTYPE_MIN (DTYPE)1.79E+308 #define DTYPE_MIN (DTYPE)-1.79E+308
#else #else
#define DTYPE float #define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38 #define DTYPE_MIN (DTYPE)-3.40E+38
...@@ -151,7 +151,9 @@ extern int verboseLevel; ...@@ -151,7 +151,9 @@ extern int verboseLevel;
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}} #define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}} #define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true #define B2I(V) V == 0 ? false : true
#define MODX(a, b) int(b == 0 ? a : a - floor(double(a)/b) * b)
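A brief worked note on the new MODX macro: it computes a floored modulo and passes the value through when the divisor is zero.

/* worked examples for MODX (the result takes the sign of b, unlike the builtin %) */
int r1 = MODX(7, 3);    /* 7 - floor(7.0/3)*3  == 1 */
int r2 = MODX(-3, 5);   /* -3 - floor(-0.6)*5  == 2, while -3 % 5 == -3 in C/C++ */
int r3 = MODX(4, 0);    /* b == 0 falls back to a, so r3 == 4 */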
/* BLAS interfaces */ /* BLAS interfaces */
#ifdef DOUBELPRICSION #ifdef DOUBELPRICSION
......
...@@ -31,15 +31,15 @@ namespace nts{ ...@@ -31,15 +31,15 @@ namespace nts{
/* constructor */ /* constructor */
template<HeapType hType, typename T> template<HeapType hType, typename T>
XHeap<hType, T>::XHeap()
{
}
/* constructor */
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap(int mySize, XMem * myMem) XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
{ {
mem = myMem; Init(mySize, myMem);
size = mySize;
count = 0;
if (mem == NULL)
items = new HeapNode<T>[mySize];
else
mem->Alloc(mem->devID, mySize * sizeof(T));
} }
/* deconstructor */ /* deconstructor */
...@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap() ...@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
} }
template<HeapType hType, typename T> template<HeapType hType, typename T>
void XHeap<hType, T>::Init(int mySize, XMem * myMem)
{
mem = myMem;
size = mySize;
count = 0;
if (mem == NULL)
items = new HeapNode<T>[mySize];
else
mem->Alloc(mem->devID, mySize * sizeof(T));
}
template<HeapType hType, typename T>
void XHeap<hType, T>::Clear(T initValue) void XHeap<hType, T>::Clear(T initValue)
{ {
count = 0; count = 0;
...@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End() ...@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
template<HeapType hType, typename T> template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node) _XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
{ {
//CheckNTErrors((count < size), "Heap is full!"); if (count < size) {
items[count] = node; items[count] = node;
Up(count); Up(count);
count++; count++;
}
else if(count == size){
HeapNode<T> & item0 = items[0];
if (hType == MIN_HEAP && item0.value >= node.value)
return;
else if (hType == MAX_HEAP && item0.value <= node.value)
return;
items[0] = node;
Down(0);
}
else {
ShowNTErrors("Overflow of the heap!");
}
} }
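A hedged sketch of the new capacity handling in Push: once the heap is full, a MIN_HEAP only replaces its top when a larger value arrives, so a fixed-size heap keeps the k largest items. The values below are assumptions for illustration.

/* a minimal sketch, assuming the XHeap/HeapNode interface declared later in this commit */
XHeap<MIN_HEAP, float> heap;
heap.Init(3);                                  /* k = 3 */
float v[] = {5.0F, 1.0F, 9.0F, 7.0F, 2.0F};
for (int i = 0; i < 5; i++)
    heap.Push(HeapNode<float>(i, v[i]));       /* keeps {5, 7, 9} */
while (heap.Count() > 0) {
    HeapNode<float> node = heap.Pop();         /* pops 5, then 7, then 9 */
    fprintf(stderr, "%lld: %f\n", node.index, node.value);
}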
/* replace the top-most item and update the heap */ /* replace the top-most item and update the heap */
...@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node) ...@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
template<HeapType hType, typename T> template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::Pop() _XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
{ {
//CheckNTErrors((size > 0), "Empty heap!"); CheckNTErrors(count > 0, "Empty heap!");
HeapNode<T> node = items[0]; HeapNode<T> node = items[0];
items[0] = items[count - 1]; items[0] = items[count - 1];
count--; count--;
......
...@@ -39,7 +39,7 @@ template <typename T> ...@@ -39,7 +39,7 @@ template <typename T>
struct HeapNode struct HeapNode
{ {
/* node index */ /* node index */
int index; long long index;
/* value of the node */ /* value of the node */
T value; T value;
...@@ -52,9 +52,16 @@ struct HeapNode ...@@ -52,9 +52,16 @@ struct HeapNode
HeapNode(int i, T v) HeapNode(int i, T v)
{ {
index = i; index = (long long)i;
value = v; value = v;
}; };
HeapNode(void * i, T v)
{
index = (long long)i;
value = v;
}
}; };
/* a heap that keeps a data array of T */ /* a heap that keeps a data array of T */
...@@ -76,11 +83,17 @@ public: ...@@ -76,11 +83,17 @@ public:
public: public:
/* constructor */ /* constructor */
XHeap();
/* constructor */
XHeap(int mySize, XMem * myMem = NULL); XHeap(int mySize, XMem * myMem = NULL);
/* deconstructor */ /* deconstructor */
~XHeap(); ~XHeap();
/* initialization */
void Init(int mySize, XMem * myMem = NULL);
/* clear the data */ /* clear the data */
void Clear(T initValue); void Clear(T initValue);
...@@ -107,6 +120,9 @@ public: ...@@ -107,6 +120,9 @@ public:
/* move item k up the tree */ /* move item k up the tree */
void Up(int k); void Up(int k);
/* how many items are kept in the heap */
inline int Count() { return count; };
}; };
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
...@@ -300,9 +300,36 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id ...@@ -300,9 +300,36 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL) if(h == NULL)
return; return;
XList list(2); if (!t1->enableGrad)
list.Add(t1); return;
list.Add(t2);
TensorList list(2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
MakeLink(&list, h, id);
}
/*
create a hyperedge with three input tensors and an output tensor
>> t1 - the first tail tensor
>> t2 - the second tail tensor
>> t3 - the third tail tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id)
{
if (h == NULL)
return;
if (!t1->enableGrad || !t2->enableGrad)
return;
TensorList list(3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
list.Add((XTensor*)t3);
MakeLink(&list, h, id); MakeLink(&list, h, id);
} }
...@@ -313,7 +340,7 @@ create a hyper edge with a list of tensors and a output tensor ...@@ -313,7 +340,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor >> h - head tensor
>> id - id of the edge type >> id - id of the edge type
*/ */
void XLink::MakeLink(const XList * list, XTensor * h, int id) void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
{ {
/* forward */ /* forward */
XLink &income = h->income; XLink &income = h->income;
...@@ -347,8 +374,11 @@ create a hyper edge with a input tensors and a list of output tensors ...@@ -347,8 +374,11 @@ create a hyper edge with a input tensors and a list of output tensors
>> list - a list of output tensors >> list - a list of output tensors
>> id - id of the edge type >> id - id of the edge type
*/ */
void XLink::MakeLink(XTensor * t, XList * list, int id) void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{ {
if (!t->enableGrad)
return;
/* forward */ /* forward */
for(int i = 0; i < list->count; i++){ for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i); XTensor * h = (XTensor*)list->GetItem(i);
...@@ -509,6 +539,88 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne) ...@@ -509,6 +539,88 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
} }
} }
/*
copy a node with another, i.e., we add the links to the new node
>> reference - the node to be copied
>> target - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
/* /*
copy incoming edges of a given node copy incoming edges of a given node
>> reference - the node we copy from >> reference - the node we copy from
...@@ -521,7 +633,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target) ...@@ -521,7 +633,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
ClearIncoming(target); ClearIncoming(target);
int tailNum = reference->income.tailNum; int tailNum = reference->income.tailNum;
XList tails(tailNum); TensorList tails(tailNum);
for(int i = 0; i < tailNum; i++){ for(int i = 0; i < tailNum; i++){
XTensor * tail = (XTensor*)reference->income.tails[i]; XTensor * tail = (XTensor*)reference->income.tails[i];
tails.Add(tail); tails.Add(tail);
...@@ -634,6 +746,29 @@ void XLink::ShowNode(FILE * file, XTensor * node) ...@@ -634,6 +746,29 @@ void XLink::ShowNode(FILE * file, XTensor * node)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
/*
search for a node in a top-down manner by its name
>> top - the top-most node
>> name - the name of the node we are looking for
<< return - the node we found (NULL if there is no matched node)
*/
XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
XLink &incoming = top->income;
for(int i = 0; i < incoming.tailNum; i++){
XTensor * child = incoming.tails[i];
XTensor * hit = SearchNode(child, name);
if(hit != NULL)
return hit;
}
return NULL;
}
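A hedged usage sketch for the new SearchNode; the tensor c and the node name "embedding" are assumptions for illustration.

/* a minimal sketch: walk the graph below c and return the first node whose name matches */
XTensor * hit = XLink::SearchNode(&c, "embedding");
if (hit != NULL)
    hit->Dump(stderr, "hit:");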
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */ /* cross reference */
struct XTensor; struct XTensor;
#define MAX_OP_NAME_LENGTH 16 #define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64 #define PARAM_UNTI_SIZE 64
/* /*
...@@ -138,13 +138,17 @@ struct XLink ...@@ -138,13 +138,17 @@ struct XLink
static static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id); void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with three input tensors and a output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and a output tensor */ /* create a hyper edge with a list of input tensors and a output tensor */
static static
void MakeLink(const XList * list, XTensor * h, int id); void MakeLink(const TensorList * list, XTensor * h, int id);
/* create a hyper edge with a input tensors and a list of output tensors */ /* create a hyper edge with a input tensors and a list of output tensors */
static static
void MakeLink(XTensor * h, XList * list, int id); void MakeLink(XTensor * h, TensorList * list, int id);
/* add a parameter */ /* add a parameter */
static static
...@@ -170,6 +174,10 @@ struct XLink ...@@ -170,6 +174,10 @@ struct XLink
static static
void Replace(const XTensor * oldOne, XTensor * newOne); void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy a node with another, i.e., we add the links to the new node */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */ /* copy links of a given node */
static static
void CopyIncoming(const XTensor * reference, XTensor * target); void CopyIncoming(const XTensor * reference, XTensor * target);
...@@ -181,6 +189,10 @@ struct XLink ...@@ -181,6 +189,10 @@ struct XLink
/* show a node */ /* show a node */
static static
void ShowNode(FILE * file, XTensor * node); void ShowNode(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
static
XTensor * SearchNode(XTensor * top, const char * name);
}; };
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University. * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -15,32 +15,31 @@ ...@@ -15,32 +15,31 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* *
* Implementation of list that keeps data items * Implementation of template list that keeps data items
* *
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17 * $Created by: HU Chi (huchinlp@foxmail.com)
* The first coding job this year! *
* */
*/
#ifndef __XLIST_H__
#define __XLIST_H__
#include "XMem.h" #include "XMem.h"
#include "XGlobal.h" #include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */ #ifndef __TensorList_H__
namespace nts{ #define __TensorList_H__
typedef int (* ListCompare)(const void * item1, const void * item2);
/* the XList class */ /* the nts (NiuTrans.Tensor) namespace */
class XList namespace nts {
{
/* the TensorListBase class */
template <typename T>
struct TensorListBase {
public: public:
/* data items */ /* data items */
void ** items; T *items;
/* number of items */ /* number of items */
int count; int count;
...@@ -49,56 +48,105 @@ public: ...@@ -49,56 +48,105 @@ public:
int maxNum; int maxNum;
/* the memory pool for data array allocation */ /* the memory pool for data array allocation */
XMem * mem; XMem* mem;
/* indicates whether data items are integers */
bool isIntList;
public: public:
/* constructor */ /* constructor */
XList(); TensorListBase();
/* constructor */ /* constructor */
XList(int myMaxNum, bool isIntListOrNot = false); TensorListBase(int myMaxNum);
/* constructor */ /* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false); TensorListBase(int myMaxNum, XMem* myMem);
/* de-constructor */ /* de-constructor */
~XList(); ~TensorListBase();
/* utilities */ /* add an item into the list */
void Create(int myMaxNum, XMem * myMem); void Add(T&& item);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount); /* return number of elements */
void AddList(XList * l); size_t Size();
void AddInt(int i);
void Insert(int pos, void * item); /* add an item into the list */
void * GetItem(int i) const; void Add(const T& item);
int GetItemInt(int i);
void SetItem(int i, void * item); /* add a number of items into the list */
void SetItemInt(int i, int item); void Add(const T* inputItems, int inputItemCount);
int FindFirst(void * item); /* append a list to the current list */
void AddList(TensorListBase* l);
/* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
void Clear(); void Clear();
void ClearStringList();
void Sort(int itemSize, ListCompare comp); /* sort the list */
void Sort(int itemSize);
/* reverse the list */
void Reverse(); void Reverse();
/* remove the item at position i */
void Remove(int i); void Remove(int i);
XList * Copy(XMem * myMem);
/* reserve space for data entry */
void Reserve(int n);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0); void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */ /* short */
_XINLINE_ void * Get(int i) {return GetItem(i);}; T& operator[] (int i) { return GetItem(i); };
_XINLINE_ int GetInt(int i) {return GetItemInt(i);}; T& Get(int i) { return GetItem(i); };
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);}; void Set(int i, T item) { SetItem(i, item); };
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);}; };
struct XTensor;
typedef TensorListBase<void*> XList;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
}; };
extern XList NULLList; typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<XTensor*> TensorList;
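A hedged sketch of the new template list; TensorList stores XTensor pointers and IntList stores ints through the same TensorListBase (InitTensor2D and the sizes are assumptions for illustration).

/* a minimal sketch, assuming the typedefs above */
TensorList tensors(2);
XTensor a, b;
InitTensor2D(&a, 2, 3);
InitTensor2D(&b, 2, 3);
tensors.Add(&a);
tensors.Add(&b);
XTensor * first = tensors.GetItem(0);      /* the same element as tensors[0] */

IntList ids(4);
for (int i = 0; i < 4; i++)
    ids.Add(i);
int last = ids[(int)ids.Size() - 1];       /* last == 3 */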
} } /* end of the nts (NiuTrans.Tensor) namespace */
/* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif // __TensorList_H__
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#ifndef __XMEM_H__ #ifndef __XMEM_H__
#define __XMEM_H__ #define __XMEM_H__
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#ifdef CUDA_BLAS #ifdef CUDA_BLAS
...@@ -38,6 +39,15 @@ ...@@ -38,6 +39,15 @@
#include <curand.h> #include <curand.h>
#endif #endif
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#elif WIN32
#include <windows.h>
#else
#include <unistd.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts{ namespace nts{
...@@ -50,8 +60,10 @@ typedef long long INT_64; ...@@ -50,8 +60,10 @@ typedef long long INT_64;
#define CUDA_HOST_MALLOC 1 #define CUDA_HOST_MALLOC 1
#define MY_PITCH CUDA_PITCH #define MY_PITCH CUDA_PITCH
#define BUF_PITCH 256 #define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 256 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
#define MAX_CPU_MEM_NUM 16
#define MAX_GPU_MEM_NUM 16
/* /*
mode of running a memory pool mode of running a memory pool
...@@ -201,6 +213,9 @@ public: ...@@ -201,6 +213,9 @@ public:
MTYPE curUsedPin; MTYPE curUsedPin;
MTYPE bufUsedPin; MTYPE bufUsedPin;
/* indicates whether the memory pool is initialized */
bool isInitialized;
#ifdef USE_CUDA #ifdef USE_CUDA
/* handle used for cublas */ /* handle used for cublas */
cublasHandle_t cublasHandle; cublasHandle_t cublasHandle;
...@@ -402,6 +417,9 @@ public: ...@@ -402,6 +417,9 @@ public:
/* create a new cublas handle */ /* create a new cublas handle */
void CreateBLASHandle(); void CreateBLASHandle();
/* show profile of the memory pool */
void ShowMemUsage(FILE * file);
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the handle of cublas */ /* get the handle of cublas */
cublasHandle_t * GetCublasHandle(); cublasHandle_t * GetCublasHandle();
...@@ -409,6 +427,61 @@ public: ...@@ -409,6 +427,61 @@ public:
}; };
/*
a class for the management of memory
*/
class XMemManager
{
private:
/* cpu memory pool information */
XMem CPUMems[MAX_CPU_MEM_NUM];
/* number of cpu memory pools */
int nCPUMem;
/* gpu memory pool information */
XMem GPUMems[MAX_GPU_MEM_NUM];
/* number of gpu memory pools */
int nGPUMem;
public:
/* constructor */
XMemManager();
/* de-constructor */
~XMemManager();
/* get the size of available CPU memory */
MTYPE GetAvailableMemory();
/* get the size of available memory on a given GPU */
MTYPE GetAvailableGPUMemory(int devID);
/* get buffer size */
void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
/* initialize it and set the global memory information */
void Initialize();
/* free it */
void Free();
/* get global memory pool */
XMem * GetMem(const int devID);
/* get global memory size */
int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
/* show memory information */
void ShowMemInfo();
};
/* managing the memories */
extern XMemManager GMems;
extern XMem * GMem; extern XMem * GMem;
extern int testxmemid; extern int testxmemid;
......
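A hedged sketch of how the new XMemManager/GMems might be driven; whether Initialize is called here or inside XDevManager::Init, and the mapping of devID -1 to the CPU pool, are assumptions.

/* a minimal sketch, assuming GMems is the global XMemManager declared above */
GMems.Initialize();                   /* set up the global memory pools */
GMems.ShowMemInfo();
XMem * cpuMem = GMems.GetMem(-1);     /* memory pool for the CPU (assumed id convention) */
XMem * gpuMem = GMems.GetMem(0);      /* memory pool for GPU 0, if any */
GMems.Free();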
...@@ -59,6 +59,8 @@ const char * GetOPName(int type) ...@@ -59,6 +59,8 @@ const char * GetOPName(int type)
return "M_DIV"; return "M_DIV";
else if (type == MATH_DIVDIM) else if (type == MATH_DIVDIM)
return "M_DIVDIM"; return "M_DIVDIM";
else if (type == MATH_MASK)
return "M_MASK";
else if (type == MATH_MATRIXMUL) else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL"; return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED) else if (type == MATH_MATRIXMULBATCHED)
...@@ -67,6 +69,8 @@ const char * GetOPName(int type) ...@@ -67,6 +69,8 @@ const char * GetOPName(int type)
return "M_MULTIPLY"; return "M_MULTIPLY";
else if (type == MATH_MULTIPLYDIM) else if (type == MATH_MULTIPLYDIM)
return "M_MULTIPLYDIM"; return "M_MULTIPLYDIM";
else if (type == MATH_MULTIPLYBROADCAST)
return "M_MULTIPLYBROADCAST";
else if (type == MATH_NEGATE) else if (type == MATH_NEGATE)
return "M_NEGATE"; return "M_NEGATE";
else if (type == MATH_NORMALIZE) else if (type == MATH_NORMALIZE)
...@@ -75,6 +79,14 @@ const char * GetOPName(int type) ...@@ -75,6 +79,14 @@ const char * GetOPName(int type)
return "M_POWER"; return "M_POWER";
else if (type == MATH_SCALEANDSHIFT) else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT)
return "M_OPERATION";
else if (type == MATH_SIGN) else if (type == MATH_SIGN)
return "M_SIGN"; return "M_SIGN";
else if (type == MATH_SUB) else if (type == MATH_SUB)
...@@ -85,6 +97,8 @@ const char * GetOPName(int type) ...@@ -85,6 +97,8 @@ const char * GetOPName(int type)
return "M_SUM"; return "M_SUM";
else if (type == MATH_SUMDIM) else if (type == MATH_SUMDIM)
return "M_SUMDIM"; return "M_SUMDIM";
else if (type == MATH_SUMBROADCAST)
return "M_SUMBROADCAST";
else if (type == REDUCE_REDUCEMAX) else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX"; return "R_REDUCEMAX";
else if (type == REDUCE_REDUCEMEAN) else if (type == REDUCE_REDUCEMEAN)
...@@ -97,13 +111,7 @@ const char * GetOPName(int type) ...@@ -97,13 +111,7 @@ const char * GetOPName(int type)
return "R_REDUCEVARIANCE"; return "R_REDUCEVARIANCE";
} }
else if ((type & DATA_BASE) != 0){ else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_CONVERTDATATYPE) if (type == GETANDSET_SELECT)
return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX";
else if (type == GETANDSET_SELECT)
return "G_SELECT"; return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED) else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED"; return "M_COPYINDEXED";
...@@ -111,6 +119,8 @@ const char * GetOPName(int type) ...@@ -111,6 +119,8 @@ const char * GetOPName(int type)
return "M_COPYVALUES"; return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER) else if (type == MOVEMENT_GATHER)
return "M_GATHER"; return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE) else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE"; return "S_CONCATENATE";
else if (type == SHAPE_MERGE) else if (type == SHAPE_MERGE)
...@@ -152,6 +162,10 @@ const char * GetOPName(int type) ...@@ -152,6 +162,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX) else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX"; return "F_SOFTMAX";
} }
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL"; return "NULL";
} }
......
...@@ -48,21 +48,29 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -48,21 +48,29 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1 #define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1 #define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1 #define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1 #define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1 #define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1 #define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_NEGATE MATH_MULTIPLYDIM + 1 #define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
#define MATH_NORMALIZE MATH_NEGATE + 1 #define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1 #define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1 #define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1 #define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1 #define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1 #define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1 #define MATH_SUM MATH_SUBDIM + 1
#define MATH_SUMDIM MATH_SUM + 1 #define MATH_SUMDIM MATH_SUM + 1
#define MATH_SUMBROADCAST MATH_SUMDIM + 1
#define REDUCE MATH_SUMDIM + 1 #define REDUCE MATH_SUMBROADCAST + 1
#define REDUCE_REDUCEMAX REDUCE + 1 #define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
...@@ -73,16 +81,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -73,16 +81,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define DATA_BASE MATH_BASE * 2 #define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1 #define GETANDSET DATA_BASE + 1
#define GETANDSET_CONVERTDATATYPE GETANDSET + 1 #define GETANDSET_CONVERTDATATYPE GETANDSET + 1
#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1 #define GETANDSET_SELECT GETANDSET_CONVERTDATATYPE + 1
#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
#define MOVEMENT GETANDSET_SELECT + 1 #define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1 #define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
...@@ -108,6 +115,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -108,6 +115,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1 #define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1 #define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
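A brief hedged note on the renumbered operator ids above: each family starts at its own base (DATA_BASE = MATH_BASE * 2, LOSS_BASE = FUNCTION_BASE * 2), which is what lets GetOPName test the family with a bit-mask before matching the exact id.

/* a minimal sketch of the id-to-name mapping (output strings taken from GetOPName above) */
fprintf(stderr, "%s\n", GetOPName(MATH_DIVDIM));        /* "M_DIVDIM" */
fprintf(stderr, "%s\n", GetOPName(MOVEMENT_GATHER));    /* "M_GATHER" */
fprintf(stderr, "%s\n", GetOPName(LOSS_CROSSENTROPY));  /* "L_CROSSENTROPY" */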
...@@ -146,7 +146,7 @@ run a set of jobs in parallel ...@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job >> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round >> sleepTime - time to sleep (in ms) for each round
*/ */
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime) void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
{ {
if(threadNum <= 0){ if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n"); XPRINT(1, stderr, "Error! No threads were created!\n");
...@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime) ...@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c); TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */ /* the arguments that are passed to the function */
volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c); volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */ /* thread */
XThread * thread = threads + availableThreads[i]; XThread * thread = threads + availableThreads[i];
......
...@@ -106,7 +106,7 @@ public: ...@@ -106,7 +106,7 @@ public:
void KillThreads(); void KillThreads();
/* run a set of jobs in parallel */ /* run a set of jobs in parallel */
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0); void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */ /* get the number of parallel jobs to run */
int GetJobNum(int size); int GetJobNum(int size);
......
...@@ -42,7 +42,7 @@ job item used in queues ...@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode() JobQueueNode::JobQueueNode()
{ {
job = NULL; job = NULL;
args = new XList(1); args = new TensorList(1);
} }
/* de-constructor */ /* de-constructor */
...@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize) ...@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
head = 0; head = 0;
tail = 0; tail = 0;
isJobQueue = false; isJobQueue = false;
jobDequeuerArgs = new XList(1); jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false; jobDequeuerBreak = false;
runningJobCount = 0; runningJobCount = 0;
jobStream = NULL; jobStream = NULL;
...@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID) ...@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true; isJobQueue = true;
jobDequeuerArgs->Clear(); jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid); // warning: this may cause unknown error
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs; jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs; jobDequeuer.argv = jobDequeuerArgs;
...@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer() ...@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
} }
/* add a job item to process */ /* add a job item to process */
void XQueue::EnqueueJob(void * job, XList * jobArgs) void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
{ {
MUTEX_LOCK(jobQueueMutex); MUTEX_LOCK(jobQueueMutex);
runningJobCount++; runningJobCount++;
...@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs) ...@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
} }
/* job item consumer */ /* job item consumer */
void XQueue::DequeueJobs(XList * args) void XQueue::DequeueJobs(TensorList * args)
{ {
CheckNTErrors((args->count == 2), "Illegal arguments!"); CheckNTErrors((args->count == 2), "Illegal arguments!");
......
...@@ -52,7 +52,7 @@ public: ...@@ -52,7 +52,7 @@ public:
void * job; void * job;
/* arguments of the job */ /* arguments of the job */
XList * args; TensorList * args;
public: public:
/* constructor */ /* constructor */
...@@ -102,7 +102,7 @@ private: ...@@ -102,7 +102,7 @@ private:
XThread jobDequeuer; XThread jobDequeuer;
/* argument list of jobDequeuer */ /* argument list of jobDequeuer */
XList * jobDequeuerArgs; TensorList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */ /* indicates whether jobDequeuer stops */
bool jobDequeuerBreak; bool jobDequeuerBreak;
...@@ -141,11 +141,11 @@ public: ...@@ -141,11 +141,11 @@ public:
void StopJobConsumer(); void StopJobConsumer();
/* add a job item to process */ /* add a job item to process */
void EnqueueJob(void * job, XList * jobArgs); void EnqueueJob(void * job, TensorList * jobArgs);
/* job item consumer */ /* job item consumer */
static static
void DequeueJobs(XList * args); void DequeueJobs(TensorList * args);
/* get the break flag */ /* get the break flag */
bool GetJobBreak(); bool GetJobBreak();
......
...@@ -85,7 +85,7 @@ namespace nts{ ...@@ -85,7 +85,7 @@ namespace nts{
#endif #endif
typedef void (*TFunction) (volatile XList*); typedef void (*TFunction) (volatile TensorList*);
/* /*
This is a class that wraps the standard implementation of threading This is a class that wraps the standard implementation of threading
...@@ -133,7 +133,7 @@ public: ...@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */ /* arguments (for the function to run) */
volatile volatile
XList * argv; TensorList * argv;
/* a flag to break */ /* a flag to break */
volatile volatile
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "arithmetic/Div.h" #include "arithmetic/Div.h"
#include "arithmetic/DivDim.h" #include "arithmetic/DivDim.h"
#include "arithmetic/Mask.h"
#include "arithmetic/MatrixMul.h" #include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h" #include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
...@@ -35,25 +36,22 @@ ...@@ -35,25 +36,22 @@
#include "arithmetic/MatrixMulBatched.h" #include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h" #include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h" #include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h" #include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h" #include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h" #include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h" #include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h" #include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Binary.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h" #include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h" #include "math/ScaleAndShift.h"
#include "math/Unary.h" #include "math/Unary.h"
...@@ -87,11 +85,14 @@ ...@@ -87,11 +85,14 @@
#include "shape/Squeeze.h" #include "shape/Squeeze.h"
#include "shape/Transpose.h" #include "shape/Transpose.h"
#include "shape/Unsqueeze.h" #include "shape/Unsqueeze.h"
#include "shape/IsSameShaped.h"
#include "sort/Sort.h" #include "sort/Sort.h"
#include "sort/TopK.h" #include "sort/TopK.h"
#include "utilities/XMatrixSegment.h" #include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h" #include "utilities/FlushToMem.h"
#include "utilities/CheckData.h"
#include "utilities/SetAscendingOrder.h"
#endif // __CHEADER_H__ #endif // __CHEADER_H__