Commit 1d17c439 by hello

Sync with github

parent 010f385d
差异被折叠。 点击展开。
This source diff could not be displayed because it is too large. You can view the blob instead.
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,19 +19,18 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
#include "./tensor/function/FHeader.h"
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
......@@ -39,27 +38,19 @@ using namespace transformer;
int main( int argc, const char ** argv )
{
/*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
_CrtSetBreakAlloc(2708);*/
TransformerMain(argc - 1, argv + 1);
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
//XTensor singleScore, singleIdx, score;
//InitTensor3DV2(&score, 2, 1, 136160);
////score.SetDataRand(0, 1);
//InitTensor1DV2(&singleIdx, 1, X_INT);
//singleIdx.Set1DInt(1, 0);
//singleIdx.Dump(stderr);
//singleScore = Select(score, singleIdx, 0);
//XTensor s, i;
//InitTensor3DV2(&s, 2, 1, 4);
//InitTensor3DV2(&i, 2, 1, 4, X_INT);
//TopK(score, s, i, -1, 4);
//i.Dump(stderr, "single score:\n");
//_CrtDumpMemoryLeaks();
return 0;
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -31,37 +31,65 @@ namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
{
if (!isEfficient) {
CheckNTErrors(node->grad != NULL, "No gradient found!");
}
else {
CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
}
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(node->grad != NULL, "No gradient found!");
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
XNoder::MakeGrad(input);
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
XTensor * dedx = input->grad;
XTensor * dedy = output->grad;
XTensor* tmp;
/* store the result to a temporary node if the input has multiple children */
if (input->outgo.tailNum > 1) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
/* otherwise, the result is directly stored into the input node */
else {
tmp = dedx;
}
if (operID == FUNC_HARDTANH)
_HardTanHBackward(output, input, dedy, tmp);
else if (operID == FUNC_IDENTITY)
_IdentityBackward(output, input, dedy, tmp);
else if (operID == FUNC_LOGSOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else if (operID == FUNC_RECTIFY)
_RectifyBackward(output, input, dedy, tmp);
else if (operID == FUNC_SIGMOID)
_SigmoidBackward(output, input, dedy, tmp);
else if (operID == FUNC_SOFTMAX) {
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
_SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (input->outgo.tailNum > 1) {
_SumMe(dedx, tmp);
DelTensor(tmp);
}
}
node->visitMark = NODE_FINISHED;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -33,7 +33,6 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
......@@ -48,33 +47,45 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
bool isRoot = XNoder::IsRoot(node);
if (!isEfficient || output->isGrad) {
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
dedy->SetDataFixed(1);
return;
}
gold = income.tails[1];
XTensor* tmp;
if (!isRoot) {
tmp = NewTensor(output);
tmp->SetZeroAll();
}
else{
tmp = dedy;
}
if (operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
if (isRoot)
gold->DestroyData();
else
_SumMe(dedy, tmp);
}
else {
ShowNTErrors("Unsupported backward computation! TODO!");
}
if (!isRoot)
DelTensor(tmp);
}
node->visitMark = NODE_FINISHED;
......@@ -87,79 +98,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when call the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -126,6 +126,18 @@ private:
static
void GradPower(XTensor * node, bool isEfficient);
/* gradient for power */
static
void GradReciprocal(XTensor* node, bool isEfficient);
/* gradient for sqrt */
static
void GradSqrt(XTensor* node, bool isEfficient);
/* gradient for square */
static
void GradSquare(XTensor* node, bool isEfficient);
/* gradient for ScaleAndShift */
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
......@@ -188,6 +200,10 @@ private:
/* gradient for operation */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
/* gradient for MLP */
static
void GradMLP(XTensor* node, bool isEfficient);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,7 +34,7 @@ class XShapeGrad
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficent);
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a shaping operation */
static
......@@ -42,55 +42,59 @@ public:
/* post processing of a node */
static
void PostProcessing(XTensor * node, int typeId, bool isEfficent);
void PostProcessing(XTensor * node, int typeId, bool isEfficient);
private:
/* gradient computation for convertdatatype: b = convertdatatype(a) */
static
void GradConvertDataType(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
static
void GradCopyIndexed(XTensor * node, bool isEfficent);
void GradCopyIndexed(XTensor * node, bool isEfficient);
/* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
static
void GradGather(XTensor * node, bool isEfficent);
void GradGather(XTensor * node, bool isEfficient);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
void GradDropoutWithIndex(XTensor * node, bool isEfficient);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
void GradMerge(XTensor * node, bool isEfficient);
/* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node, bool isEfficent);
void GradMergeList(XTensor * node, bool isEfficient);
/* gradient computation for transposing a tensor : b = transpose(a) */
static
void GradTranspose(XTensor * node, bool isEfficent);
void GradTranspose(XTensor * node, bool isEfficient);
/* gradient computation for reshaping a tensor: c = reshape(a) */
static
void GradReshape(XTensor * node, bool isEfficent);
void GradReshape(XTensor * node, bool isEfficient);
/* gradient computation for split: c = split(a) */
static
void GradSplit(XTensor * node, bool isEfficent);
void GradSplit(XTensor * node, bool isEfficient);
/* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a) */
static
void GradSplitList(XTensor * node, bool isEfficent);
void GradSplitList(XTensor * node, bool isEfficient);
/* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a).
this method is called only when all nodes of spliting have been processed. We do this in a post-processing
manner because we can fuze multiple memory copy jobs one time. This is good for system speed up. */
static
void GradSplitListPost(XTensor * node, bool isEfficent);
void GradSplitListPost(XTensor * node, bool isEfficient);
/* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node, bool isEfficent);
void GradUnsqueeze(XTensor * node, bool isEfficient);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -121,8 +121,13 @@ void XNet::Backward(TensorList &roots)
ClearGrad(parent);
}
if(XNoder::IsLeaf(node))
if (XNoder::IsLeaf(node)) {
ClearGrad(node);
if (node->outgo.tailNum == 0) {
delete node;
}
}
}
}
}
......@@ -316,7 +321,6 @@ void XNet::ClearGrad(XTensor * node)
}
if(finished){
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
}
......@@ -334,7 +338,7 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
Traverse(roots);
XLink::ShowNode(file, node);
//XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -128,8 +128,10 @@ int FNNLMMain(int argc, const char ** argv)
Init(model);
/* learn model parameters */
if(strcmp(trainFN, ""))
if(strcmp(trainFN, "")) {
ENABLE_GRAD;
Train(trainFN, shuffled, model);
}
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,12 +17,15 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <math.h>
#include <cmath>
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -34,6 +37,7 @@ AttDecoder::AttDecoder()
selfAtt = NULL;
fnns = NULL;
selfAttLayerNorms = NULL;
fnnLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
......@@ -49,123 +53,143 @@ AttDecoder::~AttDecoder()
delete[] selfAtt;
delete[] fnns;
delete[] selfAttLayerNorms;
delete[] fnnLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
delete decoderLayerNorm;
if (preNorm)
delete decoderLayerNorm;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
/*
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID)
void AttDecoder::InitModel(T2TConfig& config)
{
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
devID = config.devID;
nlayer = config.nDecLayer;
hSize = config.modelSize;
eSize = config.embSize;
vSize = config.tgtVocabSize;
dropoutP = config.dropout;
preNorm = config.preNorm;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
embedder.InitModel(config, false);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
decoderLayerNorm = new T2TLN;
fnnLayerNorms = new T2TLN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
selfAttLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
}
decoderLayerNorm->InitModel(argc, argv, myDevID);
if (preNorm)
decoderLayerNorm->InitModel(config);
}
/*
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, inputDec.GetDim(1), true);
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ende;
XTensor ln;
XTensor fnn;
XTensor inputNorm;
XTensor attNorm;
XTensor res;
XTensor selfAttnBefore;
XTensor selfAttnAfter;
XTensor endeAttnBefore;
XTensor endeAttnAfter;
XTensor fnnBefore;
/* layer normalization */
inputNorm = selfAttLayerNorms[i].Make(x);
/* layer normalization with pre-norm for self-attn */
selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);
/******************/
/* self attention */
att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);
att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore,
mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
att = att + x;
res = Sum(att, x);
/* layer normalization */
attNorm = enDeAttLayerNorms[i].Make(att);
/* layer normalization with post-norm for self-attention */
selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for encoder-decoder attention */
endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);
/* encoder-decoder attention */
ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);
ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
ende = ende + att;
res = Sum(ende, selfAttnAfter);
/* layer normalization with post-norm for encoder-decoder attention */
endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
x = fnns[i].Make(ende, isTraining);
fnn = fnns[i].Make(fnnBefore, isTraining);
}
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
x = decoderLayerNorm->Make(x);
/* residual connection */
res = Sum(fnn, endeAttnAfter);
return x;
}
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = decoderLayerNorm->Make(x);
return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,18 +17,17 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
namespace transformer
{
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder
{
......@@ -52,36 +51,29 @@ public:
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
T2TFNN* fnns;
/* attention model of each layer */
T2TAttention * selfAtt;
T2TAttention* selfAtt;
/* layer normalization for attention */
T2TLN * selfAttLayerNorms;
/* layer normalization for decoder */
T2TLN * decoderLayerNorm;
T2TLN* selfAttLayerNorms;
/* input tensor of the encoder */
XTensor * input;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
/* output tensor of the encoder */
XTensor * output;
/* layer normalization for decoder */
T2TLN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention * enDeAtt;
T2TAttention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * enDeAttLayerNorms;
T2TLN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
......@@ -89,20 +81,22 @@ public:
/* layer cache list */
Cache* enDeAttCache;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttDecoder();
/* deconstructor */
/* de-constructor */
~AttDecoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
void InitModel(T2TConfig& config);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,12 +17,15 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <math.h>
#include <cmath>
#include "T2TEncoder.h"
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
......@@ -31,63 +34,65 @@ namespace transformer
/* constructor */
AttEncoder::AttEncoder()
{
attentions = NULL;
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
encoderLayerNorm = NULL;
}
/* de-constructor */
AttEncoder::~AttEncoder()
{
delete[] attentions;
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete encoderLayerNorm;
delete[] fnnLayerNorms;
if (preNorm)
delete encoderLayerNorm;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
/*
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID)
void AttEncoder::InitModel(T2TConfig& config)
{
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
devID = config.devID;
nlayer = config.nEncLayer;
eSize = config.embSize;
hSize = config.modelSize;
vSize = config.srcVocabSize;
preNorm = config.preNorm;
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID);
embedder.InitModel(config);
attentions = new T2TAttention[nlayer];
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
encoderLayerNorm = new T2TLN;
fnnLayerNorms = new T2TLN[nlayer];
if (preNorm)
encoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
attLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
}
encoderLayerNorm->InitModel(argc, argv, myDevID);
if (preNorm)
encoderLayerNorm->InitModel(config);
}
/*
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
......@@ -95,53 +100,74 @@ make the encoding network
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, 0);
x = embedder.Make(input, false, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ln;
XTensor fnn;
XTensor res;
XTensor inputNorm;
XTensor attnBefore;
XTensor attnAfter;
XTensor fnnBefore;
/* layer normalization */
inputNorm = attLayerNorms[i].Make(x);
/* layer normalization with pre-norm for self-attn */
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = attentions[i].Make(inputNorm, inputNorm, inputNorm, mask, isTraining, NULL, 0);
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attn */
attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
x = fnns[i].Make(res, isTraining);
}
fnn = fnns[i].Make(fnnBefore, isTraining);
x = encoderLayerNorm->Make(x);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, attnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (wrapper)
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor &input, XTensor *mask, bool isTraining)
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,47 +17,35 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "module/T2TFNN.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "module/T2TEmbedding.h"
#include "module/T2TLayerNormal.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/*
base class of the encoder
/*
base class of the encoder
*/
class T2TEncoder
{
public:
virtual
XTensor Make(XTensor &input, XTensor *mask, XTensor &mask2, bool isTraining) = 0;
};
/*
the encoder based on RNN
*/
class RNNEncoder : T2TEncoder
{
public:
XTensor Make(XTensor &input, XTensor *mask, XTensor &mask2, bool isTraining);
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
};
/*
the encoder based on self-attention
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
{
......@@ -88,23 +76,23 @@ public:
T2TEmbedder embedder;
/* FNN model of each layer */
T2TFNN * fnns;
T2TFNN* fnns;
/* attention model of each layer */
T2TAttention * attentions;
T2TAttention* selfAtt;
/* layer normalizations for attention */
T2TLN * attLayerNorms;
T2TLN* attLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
/* layer normalization for encoder */
T2TLN * encoderLayerNorm;
T2TLN* encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
/* the location of layer normalization */
bool preNorm;
/* output tensor of the encoder */
XTensor * output;
public:
/* constructor */
AttEncoder();
......@@ -113,18 +101,15 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
void InitModel(T2TConfig& config);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor &input, XTensor *mask, bool isTraining);
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,16 +17,18 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#include "T2TFNN.h"
#include "T2TAttention.h"
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "T2TOutput.h"
#include "module/T2TFNN.h"
#include "module/T2TOutput.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
namespace transformer
{
......@@ -41,13 +43,13 @@ public:
int devID;
/* the encoder */
AttEncoder * encoder;
AttEncoder* encoder;
/* the decoder */
AttDecoder * decoder;
AttDecoder* decoder;
/* output layer */
T2TOutput * outputLayer;
T2TOutput* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -55,9 +57,18 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */
int nhead;
/* indicates whether share encoders embeddings with decoders */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings with output weights */
int shareDecInputOutputWeight;
public:
/* constructor */
T2TModel();
......@@ -66,42 +77,42 @@ public:
~T2TModel();
/* initialize the model */
void InitModel(int argc, char ** argv);
void InitModel(T2TConfig& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor *mask, bool isTraining);
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the encoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor *mask, XTensor &MaskEncDec, bool isTraining);
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
/* make the network for langauge modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrics */
void GetParams(TensorList &list);
/* get parameter matrices */
void GetParams(TensorList& list);
/* dump the parameters */
void Dump(const char * fn);
/* dump the model to a file */
void Dump(const char* fn);
/* read the parameters */
void Read(const char * fn);
void Read(FILE* file);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,75 +17,55 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <math.h>
#include <time.h>
#include <cmath>
#include <ctime>
#include "Transformer.h"
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "train/T2TTrainer.h"
#include "module/T2TUtility.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
namespace transformer
{
int TransformerMain(int argc, const char ** argv)
int TransformerMain(int argc, const char** argv)
{
if(argc == 0)
if (argc == 0)
return 1;
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
char * rawModel = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "rawmodel", rawModel, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
/* load configurations */
T2TConfig config(argc, argv);
srand((unsigned int)time(NULL));
T2TModel model;
model.InitModel(argc, args);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, "")){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
/* train the model */
if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model;
model.InitModel(config);
T2TTrainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
delete[] rawModel;
for(int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
/* translate the test file */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
DISABLE_GRAD;
T2TModel model;
model.InitModel(config);
T2TTranslator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
}
return 0;
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -17,13 +17,13 @@
/*
*
* An impelementation of the transformer system. See more details
* about FNNLM in
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* I start writing the code related to NMT - a long time since my last coding
* work on MT
*/
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
    /* cache for keys, (B, L, H) */
    XTensor key;

    /* cache for values, (B, L, H) */
    XTensor value;

public:
    /* indicates cache miss if 'true' */
    bool miss;

    /* constructor */
    Cache();

    /* update the states cache with new keys and values */
    void Update(XTensor&& k, XTensor&& v);

    /* keep only the states selected by the given indices of alive hypotheses */
    void KeepAlive(XTensor& aliveIdx);

    /* reorder the cached states (e.g., after beam reordering) */
    void Reorder(XTensor& reorder);
};
/* multi-head attention */
class T2TAttention
{
public:
    /* device id */
    int devID;

    /* head number */
    int nhead;

    /* transformation matrix for Q */
    XTensor wq;

    /* bias for Q */
    XTensor bq;

    /* transformation matrix for K */
    XTensor wk;

    /* bias for K */
    XTensor bk;

    /* transformation matrix for V */
    XTensor wv;

    /* bias for V */
    XTensor bv;

    /* fused transformation matrix, presumably producing Q, K and V in a
       single product — TODO(review): confirm against InitModel/Make */
    XTensor wBig;

    /* bias of the fused transformation */
    XTensor bBig;

    /* relative position representation (RPR) embeddings for keys */
    XTensor RPEmbK;

    /* transformation after dot-product attention */
    XTensor wo;

    /* bias after dot-product attention */
    XTensor bo;

    /* size of transformed Q and K */
    int dk;

    /* size of transformed V */
    int dv;

    /* size of input Q, K and V */
    int d;

    /* indicates whether we use the RPR attention */
    bool useRPR;

    /* dropout probability */
    DTYPE dropoutP;

    /* the maximum relative window size */
    int maxRP;

public:
    /* constructor */
    T2TAttention();

    /* de-constructor */
    ~T2TAttention();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& k, XTensor& q, XTensor& v,
                 XTensor* mask, bool isTraining,
                 Cache* cache, int cacheType);

    /* make the attention network given keys, queries and values (after linear transformation) */
    XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
                          XTensor* mask, bool isTraining);

    /* make the attention network with relative position representations
       given keys, queries and values (after linear transformation) */
    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
                             XTensor* mask, bool isTraining, bool isEnc);

    /* build the relative position embedding table for a (query, key) length pair */
    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);

    /* dot-product that incorporates the relative position representations */
    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn (unused in the
   decision below; the choice depends only on 'after' and 'prenorm')
>> after - whether we use layernorm after attention/fnn
<< return - the normalized tensor, or the input unchanged
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
{
    /* pre-norm normalizes before the sublayer, post-norm after it,
       so the norm is applied exactly when 'after' differs from 'prenorm' */
    const bool applyNorm = (after != prenorm);

    return applyNorm ? ln.Make(input) : input;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
using namespace nts;
namespace transformer
{
/* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
    devID = -1;
    vSize = -1;
    maxLength = -1;
    /* also give the remaining members defined values so the object is
       never read in a partially-undefined state before InitModel() runs */
    eSize = -1;
    d = -1;
    padIdx = -1;
}
/* de-constructor (nothing to release: all members clean up themselves) */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
{
    devID = config.devID;
    d = config.modelSize;
    padIdx = config.padID;
    eSize = config.embSize;
    maxLength = config.maxPosLen;

    /* the encoder and the decoder may use different vocabularies */
    vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;

    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);

    /* +2 presumably reserves extra positions (padding and start symbol) —
       TODO(review): confirm against Make(), which numbers real tokens
       from position 2 and padding as position 1 */
    maxLength = maxLength + 1 + 1;

    /* initialize the word embeddings from a normal distribution with
       standard deviation 1/sqrt(eSize) */
    DTYPE v = 1.0F / (float)sqrt((float)eSize);
    w.SetDataRandn(0, v);

    /* create the positional embedding matrix */
    MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
{
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

    /* value-initialize the buffer to 0 so that, when eSize is odd, the
       last channel of each row (never written by the two half-loops
       below) holds 0 instead of indeterminate garbage */
    float* data = new float[posEmbeddingBase.unitNum]();

    /* number of channels in each of the sin/cos halves; the frequency
       term uses a fairseq-style (channelSize - 1) denominator, which
       presumes channelSize >= 2 (i.e., eSize >= 4) to avoid a division
       by zero — TODO(review): confirm the configured embedding sizes */
    int channelSize = eSize / 2;

    for (int pos = 0; pos < length; pos++) {
        float* dp = data + pos * eSize;
        int offset = 0;

        /* first half of the channels: sine */
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
        }
        /* second half of the channels: cosine */
        for (int i = 0; i < channelSize; i++) {
            dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
        }
    }

    /* the embedding of the padding position is all zeros */
    int padStart = padIdx * eSize;
    for (int i = padStart; i < padStart + eSize; i++)
        data[i] = 0.F;

    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);

    if (w.dataType != posEmbeddingBase.dataType)
        posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);

    delete[] data;
}
/*
make the network
>> input - the word indices
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
>> nstep - the length of current sequence (current decoding step)
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
    /* make sure the padding index is 1 */
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

    XTensor wordEmbedding, position, posEmbedding;
    InitTensor(&position, &input);

    int* posData = new int[input.unitNum];

    /* copy the indices to CPU memory so we can walk them with raw
       pointers below, whatever device the input lives on */
    XTensor inputCPU;
    InitTensorOnCPU(&inputCPU, &input);
    _CopyValues(&input, &inputCPU);

    if (!isDec)
    {
        /* encoder embeddings: real tokens get consecutive positions
           starting from 2, while padding tokens all get position 1
           (assumes the padding token id is 1 — see the note above) */
        for (int i = 0; i < inputCPU.dimSize[0]; i++) {
            int startNoPad = 1 + 1;
            int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
            for (int j = 0; j < inputCPU.dimSize[1]; j++) {
                if (p[j] == 1) {
                    posData[i * inputCPU.dimSize[1] + j] = 1;
                }
                else {
                    posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
                }
            }
        }
        position.SetData(posData, position.unitNum);
    }
    else
    {
        /* decoder embeddings: during step-wise decoding all entries are
           at the same step, shifted by 2 to match the encoder numbering */
        position.SetDataFixed(nstep + 2);
    }

    delete[] posData;

    /* we make positional embeddings first */
    posEmbedding = Gather(posEmbeddingBase, position);

    /* then we make word embeddings, scaled by sqrt(eSize) as in
       "Attention Is All You Need" */
    wordEmbedding = Gather(w, input);
    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
    return wordEmbedding + posEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
{
public:
    /* device id */
    int devID;

    /* vocabulary size */
    int vSize;

    /* embedding size */
    int eSize;

    /* maximum length of the sequence (extended by 2 reserved positions
       in InitModel) */
    int maxLength;

    /* dimension size of the hidden layers in the t2t model */
    int d;

    /* padding index */
    int padIdx;

    /* word embedding matrix */
    XTensor w;

    /* predefined positional embeddings. It can speed up
       the embedding processing by re-loading. */
    XTensor posEmbeddingBase;

public:
    /* constructor */
    T2TEmbedder();

    /* de-constructor */
    ~T2TEmbedder();

    /* initialize the model */
    void InitModel(T2TConfig& config, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int length);

    /* make the network */
    XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
T2TFNN::T2TFNN()
{
    inSize = -1;
    outSize = -1;
    hSize = -1;
    /* also give the remaining members defined values so the object is
       never read in a partially-undefined state before InitModel() runs */
    devID = -1;
    dropoutP = 0.0F;
}
/* de-constructor (nothing to release: all members clean up themselves) */
T2TFNN::~T2TFNN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TFNN::InitModel(T2TConfig& config)
{
    devID = config.devID;
    inSize = config.modelSize;
    outSize = config.modelSize;
    hSize = config.fnnHiddenSize;
    dropoutP = config.fnnDropout;

    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
    InitTensor1D(&b1, hSize, X_FLOAT, devID);
    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b2, outSize, X_FLOAT, devID);

    /* fan-in/fan-out (Xavier-style) initialization for the weights,
       zeros for the biases */
    float scale = 1.0F;
    _SetDataFanInOut(&w1, scale);
    _SetDataFanInOut(&w2, scale);

    b1.SetZeroAll();
    b2.SetZeroAll();
}
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> isTraining - indicates whether the network runs in training mode
<< return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
    /* hidden = max(0, input * w1 + b1) */
    XTensor hidden = Rectify(MulAndShift(input, w1, b1));

    /* dropout on the hidden layer, applied only while training */
    if (isTraining && dropoutP > 0)
        hidden = Dropout(hidden, dropoutP);

    /* hidden * w2 + b2 */
    return MulAndShift(hidden, w2, b2);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
{
public:
    /* device id */
    int devID;

    /* size of input vector */
    int inSize;

    /* size of output vector */
    int outSize;

    /* size of hidden layers */
    int hSize;

    /* matrix of transformation 1 */
    XTensor w1;

    /* bias of transformation 1 */
    XTensor b1;

    /* matrix of transformation 2 */
    XTensor w2;

    /* bias of transformation 2 */
    XTensor b2;

    /* dropout probability applied to the hidden layer in Make() */
    DTYPE dropoutP;

public:
    /* constructor */
    T2TFNN();

    /* de-constructor */
    ~T2TFNN();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
GLU::GLU()
{
    inSize = -1;
    outSize = -1;
    hSize = -1;
    /* also initialize the device id so the object is never read in a
       partially-undefined state before InitModel() runs */
    devID = -1;
}
/* de-constructor (nothing to release: all members clean up themselves) */
GLU::~GLU()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
{
    devID = config.devID;
    inSize = config.modelSize;
    outSize = config.modelSize;

    /* FIX: 'hSize' was never assigned here (it stayed -1 from the
       constructor) yet is used as a tensor dimension below. Each half of
       the split input in Make() carries the model size, so use that.
       TODO(review): confirm against the callers of GLU. */
    hSize = config.modelSize;

    InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b1, outSize, X_FLOAT, devID);
    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
    InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2),
where [x1, x2] is the input split in half along its last dimension
>> input - the input tensor, size = 2 * hSize
<< return - the output tensor, size = hSize
*/
XTensor GLU::Make(XTensor& input)
{
    TensorList halves;

    /* split the input into two tensors along the last dimension */
    Split(input, halves, -1, 2);

    /* the gate: x2 * w2 + b2 */
    XTensor gate = MulAndShift(halves.GetItem(1), w2, b2);

    /* the linear part: x1 * w1 + b1 */
    XTensor linearPart = MulAndShift(halves.GetItem(0), w1, b1);

    return linearPart * Sigmoid(gate);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
using namespace nts;
namespace transformer
{
/* a gated linear unit: y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2) */
class GLU
{
public:
    /* device id */
    int devID;

    /* size of input vector */
    int inSize;

    /* size of output vector */
    int outSize;

    /* size of hidden layers */
    int hSize;

    /* matrix of transformation 1 */
    XTensor w1;

    /* bias of transformation 1 */
    XTensor b1;

    /* matrix of transformation 2 */
    XTensor w2;

    /* bias of transformation 2 */
    XTensor b2;

public:
    /* constructor */
    GLU();

    /* de-constructor */
    ~GLU();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
{
/* constructor */
LayerHistory::LayerHistory()
{
    d = -1;
    count = -1;
    /* NOTE(review): 'weight' is declared as an XTensor object (not a
       pointer) in the header, so assigning NULL relies on an implicit
       conversion — confirm this is intended. 'devID' and 'nlayer' stay
       uninitialized until InitModel() runs. */
    weight = NULL;
    layerNorms = NULL;
}
/* de-constructor */
LayerHistory::~LayerHistory()
{
    /* the history stores raw tensor pointers it does not own (Clear()
       does not delete the elements here) — presumably the tensors are
       owned elsewhere; confirm against TensorList */
    history.Clear();
    delete[] layerNorms;
}
/*
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
{
    devID = config.devID;
    d = config.modelSize;
    nlayer = config.nEncLayer;

    /* the (nlayer+1) x (nlayer+1) triangular weight matrix for the
       dynamic linear combination of layers (the extra row/column is
       presumably for the embedding output — confirm) */
    InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);

    layerNorms = new T2TLN[nlayer];

    /* initialize the layer normalization of each layer */
    for (int i = 0; i < nlayer; i++) {
        layerNorms[i].InitModel(config);
    }

    /* NOTE(review): 'count' is not reset here (nor in ClearHistory());
       it starts at -1 from the constructor, which makes the indexing in
       Add() look off by one — confirm where it is reset */
}
/*
the Add operation
>> tensor - the previous layer output. It might be of size B * L * H
            where B = batch size, L = sequence length,
            and H = vector size of each position
*/
void LayerHistory::Add(XTensor& tensor)
{
    /* the embedding is not normed */
    count += 1;
    if (history.Size() == 0) {
        /* NOTE(review): the list stores the address of the caller's
           tensor — the caller must keep it alive while it is in the
           history */
        history.Add(&tensor);
        return;
    }

    /* normalize the layer output before storing it.
       NOTE(review): 'ln' is a local, so storing its address dangles once
       this function returns unless TensorList::Add copies; and with
       'count' starting at -1 (see the constructor) the second call
       indexes layerNorms[-1]. Both points need confirming against
       TensorList and the call sites. */
    XTensor ln = layerNorms[count - 2].Make(tensor);
    history.Add(&ln);
}
/*
generate the weighted sum of all previous layer outputs in the history
as the input of the next layer
<< return - the weighted combination of the stored layer outputs
*/
XTensor LayerHistory::Pop()
{
    /* the number of layer outputs in the history */
    size_t size = history.Size();

    TensorList historyList;
    for (size_t i = 0; i < size; i++)
        historyList.Add(history[i]);

    /* we need stack the tensor along the first dim */
    XTensor stackTensor = Stack(historyList, 0);

    XTensor interWeight;
    InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
    XTensor layerWeight;
    InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);

    /* presumably selects row (size - 1) of the weight matrix —
       confirm _SelectRange's (dim, low, high) semantics */
    _SelectRange(&weight, &interWeight, 0, size - 1, size);
    interWeight.Reshape(interWeight.unitNum);

    /* keep the first 'size' weights of that row */
    _SelectRange(&interWeight, &layerWeight, 0, 0, size);

    /* scale each stored layer output by its weight and sum over layers */
    MultiplyDimMe(stackTensor, layerWeight, 0);

    XTensor result;
    ReduceSum(stackTensor, result, 0);

    return result;
}
/* drop all stored layer outputs.
   NOTE(review): "count" is not reset here — verify whether callers are
   expected to reset it separately */
void LayerHistory::ClearHistory()
{
    history.Clear();
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/*
history of layer outputs for the dynamic linear combination of layers
(DLCL): the output of every previous layer is stored, and the input of
the current layer is produced as a learnable weighted sum of those
outputs. (The previous comment here described multi-head attention and
appears to have been copied from the attention module.)
*/
class LayerHistory
{
public:
    /* device id */
    int devID;

    /* the triangle weight matrix for dlcl */
    XTensor weight;

    /* hidden size */
    int d;

    /* layer number */
    int nlayer;

    /* current layer number */
    int count;

    /* a history to store the output of intermediate layers */
    TensorList history;

    /* layer normalization for each intermediate layer */
    T2TLN* layerNorms;

public:
    /* constructor */
    LayerHistory();

    /* de-constructor */
    ~LayerHistory();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* add the layer output to the history */
    void Add(XTensor& tensor);

    /* compute the layer input for the current layer: the weighted sum of
       all previous (normed) layer outputs in the history */
    XTensor Pop();

    /* clean the history */
    void ClearHistory();
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: the module is empty until InitModel() is called */
T2TLN::T2TLN()
{
    d = 0;
    devID = -1;
}
/* de-constructor: nothing to free explicitly */
T2TLN::~T2TLN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TLN::InitModel(T2TConfig& config)
{
    devID = config.devID;
    d = config.modelSize;

    /* per-dimension scale (w) and bias (b) vectors */
    InitTensor1D(&w, d, X_FLOAT, devID);
    InitTensor1D(&b, d, X_FLOAT, devID);

    /* identical lower and upper bound: fill the scale vector with ones,
       the standard layer-norm initialization */
    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();
}
/*
make the network: y = (x - mean) / standard-deviation * w + b
>> input - the input tensor
<< return - layer normalization output
*/
XTensor T2TLN::Make(XTensor& input)
{
    /* NOTE: "x" aliases "input", so the FP16->FP32 conversion below
       overwrites the caller's tensor; it is converted back before the
       function returns */
    XTensor& x = input;
    XTensor xn;
    XTensor mean;
    XTensor variance;
    XTensor standard;
    XTensor meanFilled;
    XTensor standardFilled;

    TENSOR_DATA_TYPE dataType = input.dataType;

    if (dataType == X_FLOAT16) {
        /* reduce functions can only run with FP32 */
        x = ConvertDataType(input, X_FLOAT);
    }

    /* \mu = (sum_i x_i)/m */
    mean = ReduceMean(x, x.order - 1);

    /* \sigma = (sum_i (x_i - \mu)^2)/m */
    variance = ReduceVariance(x, x.order - 1, mean);

    /* standard = sqrt(variance) */
    standard = Power(variance, 0.5F);

    /* unsqueeze mean and standard deviation to fit them into
       the same shape of x */
    meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
    standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));

    /* x' = (x - \mu)/standard */
    xn = (x - meanFilled) / standardFilled;

    /* restore the original data type if a conversion happened above */
    if (dataType != mean.dataType) {
        x = ConvertDataType(x, dataType);
        xn = ConvertDataType(xn, dataType);
    }

    /* result = x' * w + b */
    return xn * w + b;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
   where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
    /* device id */
    int devID;

    /* the scale parameter w (a vector of model-dimension size) */
    XTensor w;

    /* the bias term b */
    XTensor b;

    /* dimension size of the model */
    int d;

public:
    /* constructor */
    T2TLN();

    /* de-constructor */
    ~T2TLN();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network */
    XTensor Make(XTensor& input);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
namespace transformer
{
/*
a wrapper for the gather function: 2d tensors are gathered directly,
3d tensors are temporarily viewed as 2d, gathered, and reshaped back
>> src - the input tensor (2d or 3d; reshaped in place and restored)
>> index - the index tensor
<< res - the output tensor
*/
XTensor AutoGather(XTensor& src, XTensor& index)
{
    if (src.order == 2)
        return Gather(src, index);
    else {
        CheckNTErrors(src.order == 3, "the source must be 3d");
        int order = src.order;
        int dimSize[MAX_TENSOR_DIM_NUM];
        /* remember the original shape so it can be restored below */
        for (int i = 0; i < src.order; i++) {
            dimSize[i] = src.dimSize[i];
        }

        /* fold the last two dimensions so Gather can operate on rows */
        src.Reshape(src.dimSize[0], src.dimSize[1] * src.dimSize[2]);
        XTensor res = Gather(src, index);
        src.Reshape(order, dimSize);

        /* result shape: first dim comes from the index, last dim is the
           original trailing dim, the middle one is inferred from the
           element count */
        dimSize[0] = index.dimSize[0];
        dimSize[1] = res.unitNum / (dimSize[0] * dimSize[2]);
        res.Reshape(order, dimSize);
        return res;
    }
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __T2TNNUTIL_H__
#define __T2TNNUTIL_H__
#include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* the gather function for tensor with any dimension */
XTensor AutoGather(XTensor& src, XTensor& index);
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor: sizes stay at -1 until InitModel() fills them in */
T2TOutput::T2TOutput()
{
    hSize = -1;
    vSize = -1;
    devID = -1;
}
/* de-constructor: nothing to free explicitly */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void T2TOutput::InitModel(T2TConfig& config)
{
    devID = config.devID;
    hSize = config.modelSize;
    vSize = config.tgtVocabSize;

    /* the (vocab x hidden) projection matrix */
    InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);

    /* random normal init with spread 1/sqrt(hSize)
       (second argument of SetDataRandn — presumably the standard
       deviation; verify against the XTensor API) */
    DTYPE v = 1.0F / (float)sqrt((float)hSize);
    w.SetDataRandn(0, v);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
>> isTraining - whether it is used for training (applies softmax)
>> normalized - whether to normalize the output with log-softmax
                (used for beam search; ignored when training)
*/
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
    XTensor& x = input;

    /* project hidden states onto the vocabulary; w is (vSize, hSize),
       hence the transpose */
    output = MMul(x, X_NOTRANS, w, X_TRANS);

    /* use softmax for training */
    if (isTraining) {
        output = Softmax(output, -1);
        return;
    }

    /* normalize the output for beam search */
    if (normalized) {
        auto dataType = output.dataType;
        /* run log-softmax in FP32 and convert back afterwards */
        if (dataType == X_FLOAT16)
            output = ConvertDataType(output, X_FLOAT);

        output = LogSoftmax(output, -1);

        if (output.dataType != dataType)
            output = ConvertDataType(output, dataType);
    }
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#include "T2TUtility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* output layer: projects decoder hidden states onto the target vocabulary */
class T2TOutput
{
public:
    /* device id */
    int devID;

    /* vocabulary size */
    int vSize;

    /* vector size of the linear transformation */
    int hSize;

    /* transformation matrix */
    XTensor w;

public:
    /* constructor */
    T2TOutput();

    /* de-constructor */
    ~T2TOutput();

    /* initialize the model */
    void InitModel(T2TConfig& config);

    /* make the network (redefined output tensor) */
    void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
{
/*
load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
{
    /* allocate every slot with a fixed capacity up front: options loaded
       from a config file may be more numerous (and longer) than the
       command-line ones, so reusing exact-sized copies of argv (as the
       code previously did) could write through uninitialized pointers or
       past the end of a buffer */
    char** args = new char*[MAX_PARAM_NUM];
    for (int i = 0; i < MAX_PARAM_NUM; i++) {
        args[i] = new char[1024];
        args[i][0] = '\0';
    }
    for (int i = 0; i < argc && i < MAX_PARAM_NUM; i++) {
        strncpy(args[i], argv[i], 1023);
        args[i][1023] = '\0';
    }

    char* configFN = new char[1024];
    /* the config file can only be given on the command line itself */
    LoadParamString(argc, args, "config", configFN, "");

    int argsNum = argc;

    /* load configurations from a file */
    if (strcmp(configFN, "") != 0)
        argsNum = LoadFromFile(configFN, args);

    ShowParams(argsNum, args);

    /* options for the model */
    LoadParamInt(argsNum, args, "nhead", &nhead, 8);
    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
    LoadParamInt(argsNum, args, "embsize", &embSize, 256);
    LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
    LoadParamInt(argsNum, args, "padid", &padID, 1);
    LoadParamInt(argsNum, args, "startid", &startID, 2);
    LoadParamInt(argsNum, args, "endid", &endID, 2);
    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
    LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
    LoadParamString(argsNum, args, "model", modelFN, "model.bin");
    LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
    LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");

    /* options for training; note: argsNum (not argc) is used throughout so
       that options coming from the config file are honored as well — the
       previous code read many of these with argc, silently ignoring
       file-provided values */
    LoadParamString(argsNum, args, "train", trainFN, "");
    LoadParamString(argsNum, args, "valid", validFN, "");
    LoadParamInt(argsNum, args, "dev", &devID, 0);
    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
    isTraining = (strcmp(trainFN, "") == 0) ? false : true;
    LoadParamBool(argsNum, args, "mt", &isMT, true);
    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
    LoadParamFloat(argsNum, args, "lrate", &lrate, 1.0F);
    LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0);
    LoadParamInt(argsNum, args, "nepoch", &nepoch, 20);
    LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
    LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 3000);
    LoadParamBool(argsNum, args, "adam", &useAdam, true);
    LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
    LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
    LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
    LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
    LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1);
    LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
    LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, false);
    LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
    LoadParamBool(argsNum, args, "debug", &isDebugged, false);
    LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
    LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
    LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
    LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
    LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
    LoadParamInt(argsNum, args, "bucketsize", &bucketSize, 0);

    /* options for translating */
    LoadParamString(argsNum, args, "test", testFN, "");
    LoadParamString(argsNum, args, "output", outputFN, "");
    LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
    LoadParamBool(argsNum, args, "fp16", &useFP16, false);
    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);

    /* all MAX_PARAM_NUM slots were allocated, so all must be freed */
    for (int i = 0; i < MAX_PARAM_NUM; i++)
        delete[] args[i];
    delete[] args;
    delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations
format: one option per line, name and value separated by whitespace
<< return - the number of argument strings written into args
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
    ifstream f(configFN, ios::in);
    CheckNTErrors(f.is_open(), "unable to open the config file");

    int argsNum = 0;

    /* parse arguments */
    string key, value;
    while (f >> key >> value) {
        /* prepend the dash so the option matches the "-name" form that the
           LoadParamXXX functions search for (the previous code appended it,
           producing "name-", which could never match any option) */
        key = '-' + key;
        strcpy(args[argsNum++], key.c_str());
        strcpy(args[argsNum++], value.c_str());
    }

    /* record the number of arguments */
    return argsNum;
}
/*
read a string option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - buffer that receives the value
>> defaultP - value copied into p when the option is absent
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            strcpy(p, argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    strcpy(p, defaultP);
}
/*
read an integer option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the parsed value
>> defaultP - value used when the option is absent
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            *p = atoi(argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    *p = defaultP;
}
/*
read a boolean option from an argument list; a boolean option is a pure
flag — its presence alone means "true" (no value is consumed)
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the flag value
>> defaultP - value used when the flag is absent
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        if (strcmp(argv[i], optName) == 0) {
            *p = true;
            return;
        }
    }

    /* the flag was not given: fall back to the default */
    *p = defaultP;
}
/*
read a floating-point option from an argument list
>> argc - number of arguments
>> argv - the argument list
>> name - option name, without the leading '-'
>> p - receives the parsed value
>> defaultP - value used when the option is absent
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
    /* the option as it appears on the command line: "-name" */
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    for (int i = 0; i < argc; i++) {
        /* the option must be followed by its value */
        if (strcmp(argv[i], optName) == 0 && i + 1 < argc) {
            *p = (float)atof(argv[i + 1]);
            return;
        }
    }

    /* the option was not given: fall back to the default */
    *p = defaultP;
}
/*
print the argument list to stderr in "name=value" form
>> argc - number of arguments
>> argv - the argument list
*/
void ShowParams(int argc, char** argv)
{
    fprintf(stderr, "args:\n");
    for (int i = 0; i < argc; i++) {
        /* skip single-character arguments such as "-" */
        if (argv[i][1] == 0)
            continue;
        /* an option name starts with '-' and is not a negative number
           (second character in '1'..'9' is treated as a numeric value) */
        if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
            /* the next argument is the value unless it is another option */
            if (i + 1 < argc && argv[i + 1][0] != '-')
                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
            else
                fprintf(stderr, " %s=yes\n", argv[i]);
        }
    }
    fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split string by delimiter, this will return the start indices of all
non-empty sub-strings
>> s - the original string
>> delimiter - as it is
<< indices - indices of all sub-strings
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
    UInt64List indices;

    /* an empty delimiter means no splitting: the whole string is one
       piece. Returning here is essential — without it, find("") below
       matches at every position without ever advancing "start", which
       made the original loop spin forever */
    if (delimiter.length() == 0) {
        indices.Add(0);
        return indices;
    }

    size_t pos = 0;
    uint64_t start = 0;
    while ((pos = s.find(delimiter, start)) != string::npos) {
        /* a match right at "start" is an empty field: skip it */
        if (pos != start) {
            indices.Add(start);
        }
        start = pos + delimiter.length();
    }
    /* the trailing piece after the last delimiter, if any */
    if (start != s.length()) {
        indices.Add(start);
    }
    return indices;
}
/* split a string into a list of integers */
IntList SplitInt(const string& s, const string& delimiter)
{
    IntList result;
    UInt64List starts = SplitToPos(s, delimiter);
    for (int i = 0; i < starts.Size(); i++) {
        /* strtol stops at the first non-numeric character,
           i.e., at the next delimiter */
        result.Add(strtol(s.data() + starts[i], nullptr, 10));
    }
    return result;
}
/* split a string into a list of floats */
FloatList SplitFloat(const string& s, const string& delimiter)
{
    FloatList result;
    UInt64List starts = SplitToPos(s, delimiter);
    for (int i = 0; i < starts.Size(); i++) {
        /* strtof stops at the first non-numeric character,
           i.e., at the next delimiter */
        result.Add(strtof(s.data() + starts[i], nullptr));
    }
    return result;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
public:
    /* path to the model */
    char modelFN[1024];

    /* path to the source vocab */
    char srcVocabFN[1024];

    /* path to the target vocab */
    char tgtVocabFN[1024];

    /* path to the input file (for inference) */
    char testFN[1024];

    /* path to the output file (for inference) */
    char outputFN[1024];

    /* path to the training file */
    char trainFN[1024];

    /* path to the validation file */
    char validFN[1024];

    /* device id */
    int devID;

    /* beam size */
    int beamSize;

    /* word batch size */
    int wBatchSize;

    /* sentence batch size */
    int sBatchSize;

    /* number of heads in attention */
    int nhead;

    /* number of encoder layers */
    int nEncLayer;

    /* number of decoder layers */
    int nDecLayer;

    /* the maximum relative position in RPR attentions */
    int maxRP;

    /* the dimension of embeddings */
    int embSize;

    /* the dimension of hidden layer */
    int modelSize;

    /* the maximum length in positional embedding */
    int maxPosLen;

    /* the dimension of fnn hidden layer */
    int fnnHiddenSize;

    /* the vocab size of source sequence */
    int srcVocabSize;

    /* the vocab size of target sequence */
    int tgtVocabSize;

    /* the padding id */
    int padID;

    /* start symbol */
    int startID;

    /* end symbol */
    int endID;

    /* indicates whether the model uses pre-norm */
    bool preNorm;

    /* indicates whether the model is running for machine translation */
    bool isMT;

    /* indicates whether the model is running with FP16 data type */
    bool useFP16;

    /* indicates whether we use the RPR attention */
    bool useRPR;

    /* indicates whether we train the model */
    bool isTraining;

    /* dropout rate for the model */
    float dropout;

    /* dropout rate for fnn layers */
    float fnnDropout;

    /* dropout rate for attention layers */
    float attDropout;

    /* the alpha parameter controls the length preference */
    float lenAlpha;

    /* scalar of the input sequence (for max number of search steps) */
    float maxLenAlpha;

    /* learning rate */
    float lrate;

    /* the parameter that controls the maximum learning rate in training */
    float lrbias;

    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

    /* indicates whether we use Adam */
    bool useAdam;

    /* hyper parameters of Adam */
    float adamBeta1;
    float adamBeta2;
    float adamDelta;

    /* step number of warm-up for training */
    int nwarmup;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;

    /* the factor of label smoothing */
    float labelSmoothingP;

    /* number of steps after which we make a checkpoint */
    int nStepCheckpoint;

    /* indicates whether we make a checkpoint after each training epoch */
    bool useEpochCheckpoint;

    /* number of batches on which we do model update */
    int updateStep;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

    /* buffer size */
    int bufSize;

    /* indicates whether we double the </s> symbol for the output of LM */
    bool isDoubledEnd;

    /* indicates whether we use batchsize = max * sc
       rather than batchsize = word-number, where max is the maximum
       length and sc is the sentence number */
    bool isSmallBatch;

    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* randomize batches */
    bool isRandomBatch;

    /* bucket size */
    int bucketSize;

public:
    /* load configurations from the command */
    T2TConfig(int argc, const char** argv);

    /* load configurations from a file */
    int LoadFromFile(const char* configFN, char** args);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
    /* beginning position */
    int beg;

    /* end position */
    int end;

    /* maximum word number on the encoder side */
    int maxEnc;

    /* maximum word number on the decoder side */
    int maxDec;

    /* a key for sorting */
    int key;
};
/* loader that reads sequences from file and groups them into batches */
class T2TBatchLoader
{
public:
    /* buffer for loading words */
    int* buf;

    /* another buffer */
    int* buf2;

    /* batch buf */
    BatchNode* bufBatch;

    /* buffer size */
    int bufSize;

    /* size of batch buffer */
    int bufBatchSize;

    /* length of each sequence */
    int* seqLen;

    /* another array */
    int* seqLen2;

    /* offset of the first word for each sequence */
    int* seqOffset;

    /* number of sequences in the buffer */
    int nseqBuf;

    /* offset for next sequence in the buffer */
    int nextSeq;

    /* offset for next batch */
    int nextBatch;

    /* indicates whether we double the </s> symbol for the output of LM */
    bool isDoubledEnd;

    /* indicates whether we use batchsize = max * sc
       rather than batchsize = word-number, where max is the maximum
       length and sc is the sentence number */
    bool isSmallBatch;

    /* counterpart of "isSmallBatch" */
    bool isBigBatch;

    /* randomize batches */
    bool isRandomBatch;

    /* bucket size */
    int bucketSize;

public:
    /* constructor */
    T2TBatchLoader();

    /* de-constructor */
    ~T2TBatchLoader();

    /* initialization */
    void Init(T2TConfig& config);

    /* load data to buffer */
    int LoadBuf(FILE* file, bool isSorted, int step);

    /* clear data buffer */
    void ClearBuf();

    /* set the random batch flag */
    void SetRandomBatch(bool flag = true);

    /* load a batch of sequences */
    int LoadBatch(FILE* file, bool isLM,
                  XTensor* batchEnc, XTensor* paddingEnc,
                  XTensor* batchDec, XTensor* paddingDec,
                  XTensor* gold, XTensor* label,
                  int* seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch,
                  bool isSorted, int& ws, int& wCount,
                  int devID, bool isTraining);

    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE* file,
                    XTensor* batchEnc, XTensor* paddingEnc,
                    XTensor* batchDec, XTensor* paddingDec,
                    XTensor* gold, XTensor* label,
                    int* seqs, int vs, int sBatch, int wBatch,
                    bool isSorted, int& wCount,
                    int devID, bool isTraining);

    /* load a batch of sequences (for machine translation) */
    int LoadBatchMT(FILE* file,
                    XTensor* batchEnc, XTensor* paddingEnc,
                    XTensor* batchDec, XTensor* paddingDec,
                    XTensor* gold, XTensor* label,
                    int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
                    bool isSorted, int& ws, int& wCount,
                    int devID, bool isTraining);

    /* shuffle the data file */
    void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__
#include "../T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/* trainer of the T2T model */
class T2TTrainer
{
public:
    /* configurations */
    T2TConfig* cfg;

    /* dimension size of each inner layer */
    int d;

    /* step number of warm-up for training */
    int nwarmup;

    /* vocabulary size of the source side */
    int vSize;

    /* vocabulary size of the target side */
    int vSizeTgt;

    /* learning rate */
    float lrate;

    /* the parameter that controls the maximum learning rate in training */
    float lrbias;

    /* sentence batch size */
    int sBatchSize;

    /* word batch size */
    int wBatchSize;

    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

    /* indicates whether we use adam */
    bool useAdam;

    /* hyper parameters of adam */
    float adamBeta1;
    float adamBeta2;
    float adamDelta;
    float adamBeta1T;
    float adamBeta2T;

    /* list of the moment of the parameter matrices */
    TensorList moments;

    /* list of the 2nd order moment of the parameter matrices */
    TensorList moments2nd;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;

    /* the factor of label smoothing */
    DTYPE labelSmoothingP;

    /* number of steps after which we make a checkpoint */
    int nStepCheckpoint;

    /* indicates whether we make a checkpoint after each training epoch */
    bool useEpochCheckpoint;

    /* number of batches on which we do model update */
    int updateStep;

    /* indicates whether we intend to debug the net */
    bool isDebugged;

    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

    /* for batching */
    T2TBatchLoader batchLoader;

public:
    /* constructor */
    T2TTrainer();

    /* de-constructor */
    ~T2TTrainer();

    /* initialize the trainer */
    void Init(T2TConfig& config);

    /* train the model */
    void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);

    /* test the model */
    void Validate(const char* fn, const char* ofn, T2TModel* model);

    /* make a checkpoint */
    void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);

    /* update the model by delta rule */
    void Update(T2TModel* model, const float lr);

    /* prepare model for training */
    void PrepareModel(T2TModel* model);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "T2TDataSet.h"
#include "../module/T2TUtility.h"
using namespace transformer;
namespace nts {
/* sort the input by sequence length (in descending order), so the longest
   sentences come first (the original comment described SortOutput) */
void DataSet::SortInput() {
    sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
        return a->values.count > b->values.count;
    });
}
/* sort the output by sentence id (in ascending order) so the translations are
   restored to the input order (the original comment described SortInput) */
void DataSet::SortOutput() {
    sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
        return a->id < b->id;
    });
}
/*
load the whole data file into the input buffer:
tokenize each line, map the tokens to ids, append EOS, and sort by length.
Fix: the original allocated an Example for every line, leaking the object
when the line was empty; now the allocation happens only for kept sentences.
*/
void DataSet::LoadDataToBuffer()
{
    string line;
    inputBuffer.Clear();
    bufferUsed = 0;
    int id = 0;

    const string tokenDelimiter = " ";

    while (getline(*fp, line)) {
        IntList values;

        /* load words and transform them to ids */
        auto indices = SplitToPos(line, tokenDelimiter);

        /* reserve the first 120 (MAX_WORD_NUM) words if the input is too long */
        size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();

        for (size_t i = 0; i < maxLen; i++) {
            /* a token spans from its start position to the next token's start
               (or to the end of the line for the last token) */
            auto offset = (i != (indices.Size() - 1)) ?
                indices[i + 1] - indices[i] - tokenDelimiter.size()
                : line.size() - indices[i];
            string word = line.substr(indices[i], offset);
            /* out-of-vocabulary words are mapped to id 3
               (presumably <unk> - confirm with the vocab file) */
            if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
                values.Add(3);
            else
                values.Add(srcVocab.word2id.at(word));
        }

        /* make sure that the sequence ends with EOS */
        if (values.Size() != 0 && values[-1] != EOS)
            values.Add(EOS);

        /* keep non-empty sentences; remember the ids of empty lines so the
           output can be re-aligned with the input later */
        if (values.Size() != 0) {
            Example* example = new Example;
            example->id = id;
            example->values = values;
            inputBuffer.Add(example);
        }
        else {
            emptyLines.Add(id);
        }
        id++;
    }
    fp->close();

    SortInput();

    XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of (padded) words in a batch
>> devID - the device id, -1 for the CPU
<< list of sentence ids in the batch; the last element is the total
   (unpadded) token count
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    size_t minSentBatch, size_t batchSize, int devID)
{
    size_t realBatchSize = minSentBatch;

    /* get the maximum sentence length in a mini-batch; the buffer is sorted
       by length in descending order, so the first sentence is the longest */
    size_t maxLen = inputBuffer[bufferUsed]->values.Size();

    /* dynamic batching for sentences: grow the batch while the padded
       token count (sentences * maxLen) stays below the word budget */
    while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
        && (realBatchSize * maxLen < batchSize)) {
        realBatchSize++;
    }

    /* real batch size, clipped by the number of remaining sentences */
    if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
        realBatchSize = inputBuffer.Size() - bufferUsed;
    }

    CheckNTErrors(maxLen != 0, "invalid length");

    int* batchValues = new int[realBatchSize * maxLen];
    float* paddingValues = new float[realBatchSize * maxLen];

    /* pre-fill with id 1 (presumably <pad> - confirm with the vocab)
       and a zeroed padding mask */
    for (int i = 0; i < realBatchSize * maxLen; i++) {
        batchValues[i] = 1;
        paddingValues[i] = 0.0F;
    }

    size_t cur = 0;

    /* left padding: each sentence is right-aligned inside its row */
    UInt64List infos;
    size_t totalLength = 0;

    for (int i = 0; i < realBatchSize; ++i) {
        infos.Add(inputBuffer[bufferUsed + i]->id);
        totalLength += inputBuffer[bufferUsed + i]->values.Size();

        cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
        for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
            batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
            paddingValues[cur++] = 1.0F;
        }
    }
    infos.Add(totalLength);

    InitTensor2D(batchEnc, realBatchSize, maxLen, X_INT, devID);
    InitTensor2D(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);

    /* mark these sentences as consumed */
    bufferUsed += realBatchSize;

    batchEnc->SetData(batchValues, batchEnc->unitNum);
    paddingEnc->SetData(paddingValues, paddingEnc->unitNum);

    delete[] batchValues;
    delete[] paddingValues;

    return infos;
}
/*
the initializer of DataSet (not a constructor): opens the data file,
loads the vocabularies and fills the input buffer
>> dataFile - path of the data file
>> srcVocabFN - path of the source vocab file
>> tgtVocabFN - path of the target vocab file
*/
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
    fp = new ifstream(dataFile);
    CheckNTErrors(fp->is_open(), "can not open the file");
    bufferUsed = 0;

    CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
    CheckNTErrors(strcmp(tgtVocabFN, "") != 0, "missing target vocab file");

    srcVocab.Load(srcVocabFN);

    /* share source and target vocabs when both sides use the same file */
    if (strcmp(srcVocabFN, tgtVocabFN) == 0) {
        XPRINT(0, stderr, "[INFO] share source and target vocabs \n");
        tgtVocab.CopyFrom(srcVocab);
    }
    else {
        tgtVocab.Load(tgtVocabFN);
    }

    LoadDataToBuffer();
}
/* check if the buffer is exhausted, i.e. all loaded sentences
   have already been consumed by LoadBatch() */
bool DataSet::IsEmpty() {
    return bufferUsed >= inputBuffer.Size();
}
/* dump the translations to a file, one sentence per line
>> ofn - path of the output file */
void DataSet::DumpRes(const char* ofn)
{
    ofstream ofile(ofn, ios::out);

    for (int sent = 0; sent < outputBuffer.Size(); sent++) {
        auto result = outputBuffer[sent];
        for (int pos = 0; pos < result->res.Size(); pos++) {
            int token = result->res[pos];
            /* ids below 4 are reserved symbols (presumably pad/unk/sos/eos -
               confirm with the vocab); stop dumping at the first one */
            if (token < 4)
                break;
            ofile << tgtVocab.id2word[token] << " ";
        }
        ofile << "\n";
    }

    ofile.close();
}
/* de-constructor: releases everything the data set owns */
DataSet::~DataSet()
{
    /* release all buffered examples */
    for (int k = 0; k < inputBuffer.Size(); k++)
        delete inputBuffer[k];

    /* release all buffered results */
    for (int k = 0; k < outputBuffer.Size(); k++)
        delete outputBuffer[k];

    /* release the file stream */
    delete fp;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "T2TVocab.h"
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* the struct of tokenized input (one source sentence) */
struct Example {
    /* line number of the sentence in the input file */
    int id;
    /* token ids of the sentence */
    IntList values;
};
/* the struct of tokenized output (one translated sentence) */
struct Result {
    /* line number of the source sentence this result belongs to */
    int id;
    /* token ids of the translation */
    IntList res;
};
/* A `DataSet` is associated with a file which contains variable-length data. */
struct DataSet {
public:
    /* the data buffer (sorted by length after loading) */
    InputBufferType inputBuffer;

    /* list of empty line numbers, kept so the output can be
       re-aligned with the input */
    IntList emptyLines;

    /* the result buffer */
    OutputBufferType outputBuffer;

    /* the pointer to the file stream (owned; released in the de-constructor) */
    ifstream* fp;

    /* number of buffered sentences consumed so far */
    size_t bufferUsed;

    /* the source vocabulary */
    Vocab srcVocab;

    /* the target vocabulary */
    Vocab tgtVocab;

public:
    /* sort the input by length (in descending order) */
    void SortInput();

    /* reorder the output by ids (in ascending order) */
    void SortOutput();

    /* load data from a file to the buffer */
    void LoadDataToBuffer();

    /* generate a mini-batch */
    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
        size_t sBatch, size_t wBatch, int devID);

    /* initialization function */
    void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);

    /* check if the buffer is empty */
    bool IsEmpty();

    /* dump the translations to a file */
    void DumpRes(const char* ofn);

    /* de-constructor */
    ~DataSet();
};
}
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence
>> alpha - the parameter that controls the length preference
<< return - length penalty of the sequence
*/
float T2TLengthPenalizer::GNMT(float length, float alpha)
{
    /* normalized base term (5 + n) / (5 + 1) */
    const float base = (length + 5.0F) / (1.0F + 5.0F);

    /* raise to alpha: larger alpha favors longer sequences */
    return (float)pow(base, alpha);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they have higher scores
   in a product of probability-like terms and thus have more chances
   to beat others in search. */
class T2TLengthPenalizer
{
public:
    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
       where n = length of the sequence */
    static float GNMT(float length, float alpha);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <iostream>
#include "T2TPredictor.h"
#include "../module/T2TNNUtil.h"
using namespace nts;
namespace transformer
{
/* constructor: an empty bundle with no state array allocated yet */
T2TStateBundle::T2TStateBundle()
{
    isStart = false;
    states = NULL;
}
/* de-constructor: releases the state array */
T2TStateBundle::~T2TStateBundle()
{
    /* delete[] on a null pointer is a well-defined no-op */
    delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
    /* default start symbol id 2 (presumably <s> - confirm with the vocab);
       can be overridden via SetStartSymbol() */
    startSymbol = 2;
}
/* de-constructor (the predictor owns no resources; m and s are borrowed) */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial (empty) state bundle for search
>> model - the t2t model (not referenced in this function)
>> top - the top-most layer of the network (not referenced in this function)
>> input - input of the network; its leading dimensions shape the state tensors
>> beamSize - beam size; replaces the last dimension of the input's shape
>> state - the state bundle to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
    int beamSize, T2TStateBundle* state)
{
    /* shape = input shape with the last dimension replaced by beamSize */
    int dims[MAX_TENSOR_DIM_NUM];
    for (int i = 0; i < input->order - 1; i++)
        dims[i] = input->dimSize[i];
    dims[input->order - 1] = beamSize;

    InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
    InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);

    /* no accumulated path probability and no finished hypotheses yet */
    state->probPath.SetZeroAll();
    state->nstep = 0.0F;
    state->endMark.SetZeroAll();

    state->stateNum = 0;
}
/*
set the start symbol used as the first decoder input
>> symbol - the symbol id (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
    startSymbol = symbol;
}
/*
attach the model and the current state bundle; both are borrowed
and used by subsequent Predict() calls
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
   1) hypotheses (states)
   2) probabilities of hypotheses
   3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
    m = model;
    s = state;
}
/*
predict the next state (one incremental decoding step)
>> next - the state bundle that receives the prediction probabilities
>> aliveState - indices of still-alive hypotheses, (B)
>> encoding - encoder output, (B, L, E)
>> inputEnc - input of the encoder, (B, L)
>> paddingEnc - padding of the encoder, (B, L)
>> batchSize - the raw batch size (in case some states are pruned)
>> isStart - whether it is the start state or not
>> reorderState - the new order of states
>> needReorder - whether we need to reorder the states
>> nstep - current time step of the target sequence
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
    XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
    XTensor& reorderState, bool needReorder, int nstep)
{
    int dims[MAX_TENSOR_DIM_NUM];

    /* word indices of positions up to next state */
    XTensor inputDec;

    /* the first token of every sequence is the start symbol */
    XTensor first;
    InitTensor2D(&first, batchSize, 1, X_INT, inputEnc.devID);
    first.SetDataFixed(startSymbol);

    /* add a new word into the input sequence of the decoder side */
    if (isStart) {
        inputDec = Identity(first);
    }
    else {
        /* only pass one step to the decoder (incremental decoding with cache) */
        inputDec = GetLastPrediction(s, inputEnc.devID);
    }

    /* keep alive states for the decoder; a shrunken aliveState means
       some hypotheses were pruned and the caches must be filtered too */
    if (aliveState.dimSize[0] < batchSize) {
        /* alive inputs */
        inputDec = AutoGather(inputDec, aliveState);

        /* alive cache */
        for (int i = 0; i < m->decoder->nlayer; i++) {
            m->decoder->selfAttCache[i].KeepAlive(aliveState);
            m->decoder->enDeAttCache[i].KeepAlive(aliveState);
        }
    }

    /* re-permute the caches when the beam order changed */
    if (needReorder) {
        for (int i = 0; i < m->decoder->nlayer; i++) {
            m->decoder->selfAttCache[i].Reorder(reorderState);
            m->decoder->enDeAttCache[i].Reorder(reorderState);
        }
    }

    /* prediction probabilities (written into the next state bundle) */
    XTensor& output = next->prob;
    XTensor decoding;

    /* decoder-side padding has the same shape as the decoder input */
    for (int i = 0; i < inputDec.order - 1; i++)
        dims[i] = inputDec.dimSize[i];
    dims[inputDec.order - 1] = inputDec.dimSize[inputDec.order - 1];

    XTensor paddingDec;
    InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc.devID);
    paddingDec.SetDataFixed(1);

    XTensor maskDec;
    XTensor maskEncDec;

    /* decoder mask */
    m->MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);

    /* make the decoding network */
    decoding = m->decoder->Make(inputDec, encoding, NULL, &maskEncDec, nstep, false);

    CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");

    /* generate the output probabilities */
    m->outputLayer->Make(decoding, output, false, true);
}
/*
generate paths up to the states of the current step: each row is the token
sequence leading to a state, left-padded with zeros for shorter paths
>> state - state bundle of the current step
<< a (stateNum, distance) integer tensor of token ids
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    /* the longest back-pointer chain determines the path length */
    int distance = -1;
    for (int i = 0; i < state->stateNum; i++) {
        T2TState* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
            nsteps++;
            cur = cur->last;
        }

        if (nsteps > distance)
            distance = nsteps;
    }

    /* NOTE(review): if stateNum == 0, distance stays -1 and the tensor shape
       below is invalid - confirm callers never pass an empty bundle */
    XTensor path;
    InitTensor2D(&path, state->stateNum, distance, X_INT);
    path.SetZeroAll();

    /* walk each chain backwards and fill tokens from the right */
    for (int i = 0; i < state->stateNum; i++) {
        T2TState* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
            nsteps++;
            path.Set2DInt(cur->prediction, i, distance - nsteps);
            cur = cur->last;
        }
    }

    return path;
}
/*
collect the predictions made at the previous step
>> state - state bundle of the current step
>> devID - device on which the result tensor lives
<< a (stateNum, 1) integer tensor holding the last prediction of each state
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
{
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    /* gather the last prediction of every hypothesis */
    IntList preds;
    for (int i = 0; i < state->stateNum; i++)
        preds.Add(state->states[i].prediction);

    XTensor lastPred;
    InitTensor2D(&lastPred, preds.Size(), 1, X_INT, devID);
    lastPred.SetData(preds.items, preds.Size());

    return lastPred;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "../T2TModel.h"
#include "T2TLengthPenalty.h"
using namespace std;
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
   and etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
    /* we assume that the prediction is an integer */
    int prediction;

    /* id of the problem. One can regard it as the sentence id when we
       translate a number of sentences in the batched manner. The hypothesis
       is empty if id = -1 */
    int pid;

    /* indicates whether the state is an end */
    bool isEnd;

    /* indicates whether the state is the start */
    bool isStart;

    /* indicates whether the state is completed */
    bool isCompleted;

    /* probability of the prediction (last state of the path) */
    float prob;

    /* probability of the whole path */
    float probPath;

    /* model score of the path. A model score = path probability + some other stuff */
    float modelScore;

    /* number of steps we go over so far */
    int nstep;

    /* pointer to the previous state (back-pointer along the path) */
    T2TState* last;
};
/* a bundle of states: tensor views over a whole beam of hypotheses */
class T2TStateBundle
{
public:
    /* predictions */
    XTensor prediction;

    /* id of the previous state that generates the current one */
    XTensor preID;

    /* mark that indicates whether each hypothesis is completed */
    XTensor endMark;

    /* probability of every prediction (last state of the path) */
    XTensor prob;

    /* probability of every path */
    XTensor probPath;

    /* model score of every path */
    XTensor modelScore;

    /* step number of each hypothesis */
    float nstep;

    /* list of states (owned; released in the de-constructor) */
    T2TState* states;

    /* number of states */
    int stateNum;

    /* indicates whether it is the first state */
    bool isStart;

public:
    /* constructor */
    T2TStateBundle();

    /* de-constructor */
    ~T2TStateBundle();

    /* create states */
    void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
   It is exactly the same procedure of MT inference -
   we get the state of previous words and then generate the next word.
   Here, a state can be regarded as the representation of words (word
   indices, hidden states, embeddings and etc.). */
class T2TPredictor
{
private:
    /* pointer to the transformer model (borrowed, set by Read()) */
    T2TModel* m;

    /* current state (borrowed, set by Read()) */
    T2TStateBundle* s;

    /* start symbol */
    int startSymbol;

    /* end symbol (NOTE(review): not initialized in the visible constructor -
       confirm it is set before use) */
    int endSymbol;

public:
    /* constructor */
    T2TPredictor();

    /* de-constructor */
    ~T2TPredictor();

    /* create an initial state */
    void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);

    /* set the start symbol */
    void SetStartSymbol(int symbol);

    /* read a state */
    void Read(T2TModel* model, T2TStateBundle* state);

    /* predict the next state */
    void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
        XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
        bool isStart, XTensor& reorderState, bool needReorder, int nstep);

    /* generate paths up to the states of the current step */
    XTensor GeneratePaths(T2TStateBundle* state);

    /* get the predictions of the previous step */
    XTensor GetLastPrediction(T2TStateBundle* state, int devID);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "../T2TModel.h"
#include "T2TPredictor.h"
using namespace std;
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
   distributions of the predictions and prunes the search space by beam pruning.
   This makes a graph where each path represents a translation hypothesis.
   The output can be the path with the highest model score. */
class BeamSearch
{
private:
    /* the alpha parameter controls the length preference */
    float alpha;

    /* predictor */
    T2TPredictor predictor;

    /* max length of the generated sequence */
    int maxLength;

    /* beam size */
    int beamSize;

    /* batch size */
    int batchSize;

    /* we keep the final hypotheses in a heap for each sentence in the batch */
    XHeap<MIN_HEAP, float>* fullHypos;

    /* array of the end symbols */
    int* endSymbols;

    /* number of the end symbols */
    int endSymbolNum;

    /* start symbol */
    int startSymbol;

    /* scalar of the input sequence (for the max number of search steps) */
    float scalarMaxLength;

    /* indicates whether the early stop strategy is used */
    bool isEarlyStop;

    /* pids for alive states */
    IntList aliveStatePids;

    /* alive sentences */
    IntList aliveSentList;

    /* whether we need to reorder the states */
    bool needReorder;

public:
    /* constructor */
    BeamSearch();

    /* de-constructor */
    ~BeamSearch();

    /* initialize the model */
    void Init(T2TConfig& config);

    /* search for the most promising states */
    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);

    /* preparation */
    void Prepare(int myBatchSize, int myBeamSize);

    /* compute the model score for each hypothesis */
    void Score(T2TStateBundle* prev, T2TStateBundle* beam);

    /* generate token indices via beam pruning */
    void Generate(T2TStateBundle* prev, T2TStateBundle* beam);

    /* expand the search graph */
    void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);

    /* collect hypotheses with an ending symbol */
    void Collect(T2TStateBundle* beam);

    /* fill the hypothesis heap with incomplete hypotheses */
    void FillHeap(T2TStateBundle* beam);

    /* save the output sequences and scores */
    void Dump(IntList* output, XTensor* score);

    /* check if the token is an end symbol */
    bool IsEnd(int token);

    /* check whether all hypotheses are completed */
    bool IsAllCompleted(T2TStateBundle* beam);

    /* update the beam by pruning finished states */
    void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
        XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);

    /* set end symbols for search */
    void SetEnd(const int* tokens, const int tokenNum);

    /* make a mask to prevent duplicated entries in beam expansion for the first position */
    XTensor MakeFirstMask(T2TStateBundle* beam);
};
/* greedy search: takes the single most probable prediction at each step
   (cheaper than beam search; no hypothesis bookkeeping is needed) */
class GreedySearch
{
private:
    /* predictor */
    T2TPredictor predictor;

    /* max length of the generated sequence */
    int maxLength;

    /* batch size */
    int batchSize;

    /* array of the end symbols */
    int* endSymbols;

    /* number of the end symbols */
    int endSymbolNum;

    /* start symbol */
    int startSymbol;

    /* scalar of the input sequence (for the max number of search steps) */
    float scalarMaxLength;

public:
    /* constructor */
    GreedySearch();

    /* de-constructor */
    ~GreedySearch();

    /* initialize the model */
    void Init(T2TConfig& config);

    /* search for the most promising states */
    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);

    /* preparation */
    void Prepare(int myBatchSize);

    /* check if the token is an end symbol */
    bool IsEnd(int token);

    /* set end symbols for search */
    void SetEnd(const int* tokens, const int tokenNum);
};
}
#endif
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论