Merge branch liyinqiao.

d291f56a · liyinqiao · 66f7a298 · d291f56a · d291f56a · d291f56a
Commit d291f56a authored Feb 06, 2021 by liyinqiao
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/LICENSE
+++ b/LICENSE
--- a/README.md
+++ b/README.md
--- a/data/fnnlm/test/wsj.test
+++ b/data/fnnlm/test/wsj.test
--- a/data/fnnlm/train/wsj.train
+++ b/data/fnnlm/train/wsj.train
--- a/data/transformer/test/bpevocab
+++ b/data/transformer/test/bpevocab
--- a/data/transformer/test/code
+++ b/data/transformer/test/code
--- a/data/transformer/test/test.de
+++ b/data/transformer/test/test.de
--- a/data/transformer/test/test.en
+++ b/data/transformer/test/test.en
--- a/data/transformer/train/bpevocab
+++ b/data/transformer/train/bpevocab
--- a/data/transformer/train/code
+++ b/data/transformer/train/code
--- a/data/transformer/train/train.data.tgz
+++ b/data/transformer/train/train.data.tgz
--- a/data/transformer/train/valid.data.tgz
+++ b/data/transformer/train/valid.data.tgz
--- a/doc/Configuration.md
+++ b/doc/Configuration.md
-# NiuTrans.Tensor环境配置
-
-## 注意事项
-
-CUDA最新版本9.2尚且不支持VS2017最新版本，因此建议使用CUDA版本为9.0或9.1，建议使用VS版本为VS2015，或使用VS2017时安装v140工具集，解决方案平台设置为×64。
-
-## CUDA配置
-
-在已安装好VS、CUDA并配置好环境变量后，一些关键的CUDA配置选项如下所示，以下配置选项在 **项目 -> 属性** 中可以找到。
-
->$(CUDA_PATH)\include
-
-加入到 **VC++目录 -> 包含** 中。
-
->$(CUDA_PATH)\lib\Win32
-
-加入到 **VC++目录 -> 库** 中。
-
->cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
-
-加入到 **链接器->输入->附加依赖项** 中。
-
-配置完成后，右键 **工程->项目依赖性** ，选择CUDA9。
-在.cu文件上右键属性，在项类型中选择"CUDA C/C++"（最好搜索.cu文件，然后全选设置）。
-
-## 其他配置
-
-**C/C++->常规->SDL检查**，设为否。
-
-在 **C/C++->预处理器->预处理器定义** 中，添加
-
->USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
-CONSOLE;
-
-**链接器->系统->子系统**，设置为控制台。
-
-**常规->字符集**，使用Unicode字符集。
-
-**调试->命令参数**中设置可执行文件所需要的参数。
-
-
--- a/doc/manual.md
+++ b/doc/manual.md
--- a/doc/pic/embeding-pos.jpg
+++ b/doc/pic/embeding-pos.jpg
--- a/doc/pic/self-attention-tensor-figure.jpg
+++ b/doc/pic/self-attention-tensor-figure.jpg
--- a/doc/pic/self-attention.jpg
+++ b/doc/pic/self-attention.jpg
--- a/doc/pic/transformer-architecture.jpg
+++ b/doc/pic/transformer-architecture.jpg
--- a/doc/pic/transformer-training.jpg
+++ b/doc/pic/transformer-training.jpg
--- a/doc/pic/transformer-translate.jpg
+++ b/doc/pic/transformer-translate.jpg
--- a/source/Main.cpp
+++ b/source/Main.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
+ */
+
+#include <stdio.h>
+#include "./network/XNet.h"
+#include "./tensor/XUtility.h"
+#include "./tensor/function/FHeader.h"
+#include "./tensor/core/CHeader.h"
+#include "./tensor/test/Test.h"
+#include "./sample/fnnlm/FNNLM.h"
+#include "./sample/transformer/NMT.h"
+
+//#define CRTDBG_MAP_ALLOC
+//#include <stdlib.h>
+//#include <crtdbg.h>
+
+using namespace nts;
+using namespace fnnlm;
+using namespace nmt;
+
+int main( int argc, const char ** argv )  /* entry point: the first command-line argument selects what to run */
+{
+    if(argc > 1 && !strcmp(argv[1], "-test"))  /* "-test": call Test() */
+        Test();
+    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))  /* "-fnnlm": call FNNLMMain with the argument list shifted by one, so "-fnnlm" becomes its argv[0] */
+        FNNLMMain(argc - 1, argv + 1);
+    else if(argc > 1 && !strcmp(argv[1], "-t2t"))  /* "-t2t": call NMTMain with the same one-position shift */
+        NMTMain(argc - 1, argv + 1);
+    else{  /* no argument or an unrecognized one: print a usage message to stderr */
+        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
+        fprintf(stderr, "neural networks in an easy way. \n\n");
+        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
+        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
+        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
+    }
+
+    return 0;  /* 0 is returned on every path, including the usage-message path */
+}
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
- */
-
-#include <stdio.h>
-#include "XNet.h"
-#include "../tensor/XUtility.h"
-#include "../tensor/function/FHeader.h"
-#include "../tensor/core/CHeader.h"
-#include "../tensor/test/Test.h"
-#include "../sample/fnnlm/FNNLM.h"
-#include "../sample/transformer/Transformer.h"
-
-//#define CRTDBG_MAP_ALLOC
-//#include <stdlib.h>
-//#include <crtdbg.h>
-
-void BackwardTest();
-void TransposeTest();
-void SumDimTest();
-
-using namespace nts;
-using namespace fnnlm;
-using namespace transformer;
-
-int main( int argc, const char ** argv )
-{
-    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
-    //_CrtSetBreakAlloc(2708);
-
-    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
-        FNNLMMain(argc - 1, argv + 1);
-    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
-        TransformerMain(argc - 1, argv + 1);
-    else{
-        fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
-        fprintf(stderr, "neural networks in an easy way. \n\n");
-        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
-        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
-    }
-
-    //_CrtDumpMemoryLeaks();
-    
-    return 0;
-}
-
-void BackwardTest()
-{
-    XNet net;
-
-    XTensor a;
-    XTensor b;
-    XTensor c;
-    XTensor mean;
-    XTensor origin;
-    InitTensor2D(&a, 2, 3);
-    InitTensor1D(&b, 2);
-
-    a.SetZeroAll();
-    b.SetZeroAll();
-    a.Set2D(1.0F, 0, 0);
-    a.Set2D(2.0F, 0, 1);
-    a.Set2D(3.0F, 0, 2);
-    a.Set2D(4.0F, 1, 0);
-    a.Set2D(5.0F, 1, 1);
-    a.Set2D(6.0F, 1, 2);
-
-    b.Set1D(2.0F, 0);
-    b.Set1D(1.0F, 1);
-
-    c = DivDim(a, b, 0);
-    c.Dump(stderr, "c:");
-
-    //XLink::ShowNetwork(stderr, &c);
-
-    net.Backward(c);
-
-    net.Dump(stderr);
-
-}
-
-void TransposeTest()
-{
-#ifdef USE_CUDA
-    XMem mem0(0, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
-    //XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
-    XTensor x;
-    XTensor y;
-    XTensor z;
-
-    int loops = 2000;
-
-    int B = 3 * 2 * 4;
-    int K = 8 * 1;
-    int N = 50;
-    int H = 512 * 4;
-
-    int nnn = GDevs.nGPU;
-
-    InitTensor3D(&x, B, N, H, X_FLOAT, 0);
-    InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
-    InitTensor3D(&z, B, N, H, X_FLOAT, 0);
-
-    cudaEvent_t ctime0;
-    cudaEvent_t ctime1;
-    cudaEvent_t ctime2;
-    cudaEvent_t ctime3;
-    cudaEvent_t ctime4;
-    cudaEvent_t ctime5;
-
-    float elapsedSplit = 0.0;
-    float elapsedMerge = 0.0;
-    float elapsedSum = 0.0;
-
-    cudaEventCreate(&ctime0);
-    cudaEventCreate(&ctime1);
-    cudaEventCreate(&ctime2);
-    cudaEventCreate(&ctime3);
-    cudaEventCreate(&ctime4);
-    cudaEventCreate(&ctime5);
-
-    cudaEventRecord(ctime0, 0);
-
-    double time0 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Split(&x, &y, 2, K);
-    double time1 = GetClock();
-    
-    cudaEventRecord(ctime1, 0);
-    cudaEventSynchronize(ctime1);
-    cudaEventElapsedTime(&elapsedSplit, ctime0, ctime1);
-
-    cudaEventRecord(ctime2, 0);
-
-    double time2 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Merge(&y, &x, 3);
-    double time3 = GetClock();
-
-    cudaEventRecord(ctime3, 0);
-    cudaEventSynchronize(ctime3);
-    cudaEventElapsedTime(&elapsedMerge, ctime2, ctime3);
-
-    cudaEventRecord(ctime4, 0);
-
-    double time4 = GetClock();
-    for(int i = 0; i < loops; i++)
-        _Sum(&x, &z, &x);
-    double time5 = GetClock();
-
-    cudaEventRecord(ctime5, 0);
-    cudaEventSynchronize(ctime5);
-    cudaEventElapsedTime(&elapsedSum, ctime4, ctime5);
-
-    fprintf(stderr, "split:%f merge:%f sum:%f\n", time1 - time0, time3 - time2, time5 - time4);
-    fprintf(stderr, "split:%f merge:%f sum:%f\n", elapsedSplit, elapsedMerge, elapsedSum);
-#endif
-}
-
-void SumDimTest()
-{
-    XTensor x;
-    XTensor y;
-    XTensor z;
-
-    int a = 5;
-    int b = 7;
-    int c = 3;
-
-    InitTensor3D(&x, a, b, c, X_FLOAT, -1);
-    InitTensor1D(&y, c, X_FLOAT, -1);
-    InitTensor3D(&z, a, b, c, X_FLOAT, -1);
-
-    x.SetZeroAll();
-    y.SetZeroAll();
-    z.SetZeroAll();
-
-    DTYPE * data = new DTYPE[x.unitNum];
-
-    for(int i = 0; i < x.unitNum; i++)
-        data[i] = (DTYPE)i;
-    x.SetData(data, x.unitNum);
-
-    for(int i = 0; i < y.unitNum; i++)
-        data[i] = -(DTYPE)i;
-    y.SetData(data, y.unitNum);
-
-    _SumDim(&x, &y, &z, 2);
-
-    z.Dump(stderr, "z:");
-
-    delete[] data;
-}
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,37 +31,65 @@ namespace nts{
 /* compute dE/dx of a node */
 void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
+    if (!isEfficient) {
+        CheckNTErrors(node->grad != NULL, "No gradient found!");
+    }
+    else {
+        CheckNTErrors(!node->isGrad || node->grad != NULL, "No gradient found!");
+    }
+
    XLink &income = node->income;
    int operID = income.typeID;

-    CheckNTErrors(node->grad != NULL, "No gradient found!");
    CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");

    XTensor * input = income.tails[0];
    XTensor * output = node;

-    XNoder::MakeGrad(input);
+    if (!isEfficient || input->isGrad) {
+        XNoder::MakeGrad(input);

-    if(operID == FUNC_HARDTANH)
-        _HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
-    else if(operID == FUNC_IDENTITY)
-        _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
-    else if(operID == FUNC_LOGSOFTMAX){
-        int leadDim = income.GetParamInt(0);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
-        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-    }
-    else if(operID == FUNC_RECTIFY)
-        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
-    else if(operID == FUNC_SIGMOID)
-        _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
-    else if(operID == FUNC_SOFTMAX){
-        int leadDim = income.GetParamInt(0);
-        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
-        _SoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
-    }
-    else{
-        ShowNTErrors("Wrong activation function type!");
+        XTensor * dedx = input->grad;
+        XTensor * dedy = output->grad;
+
+        XTensor* tmp;
+
+        /* store the result to a temporary node if the input has multiple children */
+        if (input->outgo.tailNum > 1) {
+            tmp = NewTensor(output);
+            tmp->SetZeroAll();
+        }
+        /* otherwise, the result is directly stored into the input node  */
+        else {
+            tmp = dedx;
+        }
+
+        if (operID == FUNC_HARDTANH)
+            _HardTanHBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_IDENTITY)
+            _IdentityBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_LOGSOFTMAX) {
+            int leadDim = income.GetParamInt(0);
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
+            _LogSoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+        }
+        else if (operID == FUNC_RECTIFY)
+            _RectifyBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_SIGMOID)
+            _SigmoidBackward(output, input, dedy, tmp);
+        else if (operID == FUNC_SOFTMAX) {
+            int leadDim = income.GetParamInt(0);
+            CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
+            _SoftmaxBackward(NULL, output, input, dedy, tmp, NULL, leadDim, NOLOSS);
+        }
+        else {
+            ShowNTErrors("Unsupported backward computation! TODO!");
+        }
+
+        if (input->outgo.tailNum > 1) {
+            _SumMe(dedx, tmp);
+            DelTensor(tmp);
+        }
    }

    node->visitMark = NODE_FINISHED;

--- a/source/network/XBackwardFunc.h
+++ b/source/network/XBackwardFunc.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,6 @@

 namespace nts{

-
 /* compute dE/dx of a node */
 void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
 {
@@ -48,33 +47,45 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * padding = NULL;
    int leadingDim;

-    XNoder::MakeGrad(output);
-    XTensor * dedy = output->grad;
+    bool isRoot = XNoder::IsRoot(node);

-    if (income.tailNum == 1) {
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else
-            ShowNTErrors("TODO");
+    if (!isEfficient || output->isGrad) {
+        XNoder::MakeGrad(output);
+        XTensor * dedy = output->grad;

-        return;
-    }
+        if (income.tailNum == 1) {
+            dedy->SetDataFixed(1);
+            return;
+        }

-    gold = income.tails[1];
+        gold = income.tails[1];

-    if(operID == LOSS_CROSSENTROPY) {
-        if (income.tailNum == 3) 
-            padding = income.tails[2];
-         leadingDim = income.GetParamInt(0);
-        CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
-        _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
-    }
-    else{
-        ShowNTErrors("Wrong activation function type!");
+        XTensor* tmp;
+        if (!isRoot) {
+            tmp = NewTensor(output);
+            tmp->SetZeroAll();
+        }
+        else{
+            tmp = dedy;
+        }
+
+        if (operID == LOSS_CROSSENTROPY) {
+            if (income.tailNum == 3)
+                padding = income.tails[2];
+            leadingDim = income.GetParamInt(0);
+            CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
+            _CrossEntropyBackward(tmp, output, gold, weight, padding, leadingDim);
+            if (isRoot)
+                gold->DestroyData();
+            else
+                _SumMe(dedy, tmp);
+        }
+        else {
+            ShowNTErrors("Unsupported backward computation! TODO!");
+        }
+        
+        if (!isRoot)
+            DelTensor(tmp);
    }

    node->visitMark = NODE_FINISHED;
@@ -87,79 +98,4 @@ bool XLossGrad::IsLossOP(XTensor * node)
    return (income.typeID & LOSS_BASE) != 0;
 }

-/* 
-compute dE/dx for a given function y = f(x) 
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> x - input of the function
->> dedy - dE/dy
->> dedx - dE/dx
->> funcID - id of the function f
->> params - parameters of the function
->> lossName - name of the loss, e.g., cross entropy
-*/
-void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx, XTensor * padding,
-                        int funcID, void * params,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    CheckNTErrors(gold && y && x, "Empty input tensors!");
-    CheckNTErrors(dedx, "Empty gradient tensors!");
-    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
-
-    if(funcID == FUNC_HARDTANH){
-        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_IDENTITY){
-        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_LOGSOFTMAX){
-        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else if(funcID == FUNC_RECTIFY){
-        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_SIGMOID){
-        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
-    }else if(funcID == FUNC_SOFTMAX){
-        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else{
-        ShowNTErrors("wrong function found when call the backward process!");
-    }
-
-}
-
-/* 
-compute dE/dy for variable y and error(loss) function E
->> gold - gold standard to measure error (or loss)
->> y - output of the function
->> dedy - dE/dy
->> lossName - name of the loss, e.g., cross entropy
-*/
-void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, XTensor * padding,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    if(gold == NULL){
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixedFloat(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixedDouble(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixedInt(dedy, 1);
-        else{
-            ShowNTErrors("TODO");
-        }
-        return;
-    }
-
-    //_LossBackward(dedy, gold, y, lossName);
-    if(lossName == CROSSENTROPY)
-        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
-
-}
-
 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,11 +43,11 @@ public:
    static
    bool IsLossOP(XTensor * node);

-    /* compute dE/dx for a given function y = f(x) */
-    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx, XTensor * padding,
-                 int funcID, void * params,
-                 LOSS_FUNCTION_NAME lossName);
+    ///* compute dE/dx for a given function y = f(x) */
+    //void Compute(XTensor * gold, XTensor * y, XTensor * x, 
+    //             XTensor * dedy, XTensor * dedx, XTensor * padding,
+    //             int funcID, void * params,
+    //             LOSS_FUNCTION_NAME lossName);

    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 

--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -126,6 +126,18 @@ private:
    static
    void GradPower(XTensor * node, bool isEfficient);

+    /* gradient for reciprocal */
+    static
+    void GradReciprocal(XTensor* node, bool isEfficient);
+
+    /* gradient for sqrt */
+    static
+    void GradSqrt(XTensor* node, bool isEfficient);
+    
+    /* gradient for square */
+    static
+    void GradSquare(XTensor* node, bool isEfficient);
+
    /* gradient for ScaleAndShift */
    static
    void GradScaleAndShift(XTensor * node, bool isEfficient);
@@ -146,10 +158,10 @@ private:
    static
    void GradSub(XTensor * node, bool isEfficient);
    
-	/* gradient for sub with one dimension: c = a - b * \beta
-	where the size of b is equal to that of one dimension of a */
-	static
-	void GradSubDim(XTensor * node, bool isEfficient);
+    /* gradient for sub with one dimension: c = a - b * \beta
+    where the size of b is equal to that of one dimension of a */
+    static
+    void GradSubDim(XTensor * node, bool isEfficient);

    /* gradient for sum: c =  a + b * \beta */
    static
@@ -173,6 +185,10 @@ private:
    static
    void GradReduceSum(XTensor * node, bool isEfficient);

+    /* gradient for reduceSumAll */
+    static
+    void GradReduceSumAll(XTensor * node, bool isEfficient);
+
    /* gradient for reduceSumSquared */
    static
    void GradReduceSumSquared(XTensor * node, bool isEfficient);
@@ -184,6 +200,10 @@ private:
    /* gradient for operation */
    static
    void GradMulAndShift(XTensor * node, bool isEfficient);
+
+    /* gradient for MLP */
+    static
+    void GradMLP(XTensor* node, bool isEfficient);
 };

 }

--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
--- a/source/network/XBackwardShape.h
+++ b/source/network/XBackwardShape.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -34,7 +34,7 @@ class XShapeGrad
 public:
    /* compute dE/dx of a node */
    static
-    void MakeGrad(XTensor * node, bool isEfficent);
+    void MakeGrad(XTensor * node, bool isEfficient);

    /* indicates whether the node is for a shaping operation */
    static
@@ -42,55 +42,59 @@ public:

    /* post processing of a node */
    static
-    void PostProcessing(XTensor * node, int typeId, bool isEfficent);
+    void PostProcessing(XTensor * node, int typeId, bool isEfficient);

 private:
    
+    /* gradient computation for convertdatatype: b = convertdatatype(a) */
+    static
+    void GradConvertDataType(XTensor * node, bool isEfficient);
+            
    /* gradient computation for copying indexed sub-tensors: b = copyindexed(a, srcIndex, indexSize, tgtIndex, copyNum) */
    static
-    void GradCopyIndexed(XTensor * node, bool isEfficent);
+    void GradCopyIndexed(XTensor * node, bool isEfficient);
        
    /* gradient computation for copying indexed sub-tensors: b = gather(a, index) */
    static
-    void GradGather(XTensor * node, bool isEfficent);
+    void GradGather(XTensor * node, bool isEfficient);

    /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
    static
-    void GradDropoutWithIndex(XTensor * node, bool isEfficent);
+    void GradDropoutWithIndex(XTensor * node, bool isEfficient);

    /* gradient computation for merge: c = merge(a, b, ...) */
    static
-    void GradMerge(XTensor * node, bool isEfficent);
+    void GradMerge(XTensor * node, bool isEfficient);

    /* gradient computation for merging a list of tensors : c = merge(list(a, b, ...)) */
    static
-    void GradMergeList(XTensor * node, bool isEfficent);
+    void GradMergeList(XTensor * node, bool isEfficient);
    
    /* gradient computation for transposing a tensor : b = transpose(a) */
    static
-    void GradTranspose(XTensor * node, bool isEfficent);
+    void GradTranspose(XTensor * node, bool isEfficient);

    /* gradient computation for reshaping a tensor: c = reshape(a) */
    static
-    void GradReshape(XTensor * node, bool isEfficent);
+    void GradReshape(XTensor * node, bool isEfficient);

    /* gradient computation for split: c = split(a) */
    static
-    void GradSplit(XTensor * node, bool isEfficent);
+    void GradSplit(XTensor * node, bool isEfficient);

    /* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a) */
    static
-    void GradSplitList(XTensor * node, bool isEfficent);
+    void GradSplitList(XTensor * node, bool isEfficient);

    /* gradient computation for spliting. we return the list of the splits : list(c_1, ...) = split(a).
       this method is called only when all nodes of spliting have been processed. We do this in a post-processing
       manner because we can fuze multiple memory copy jobs one time. This is good for system speed up. */
    static
-    void GradSplitListPost(XTensor * node, bool isEfficent);
+    void GradSplitListPost(XTensor * node, bool isEfficient);

    /* gradient computation for unsqueezing a tensor : c = unsqueeze(a) */
    static
-    void GradUnsqueeze(XTensor * node, bool isEfficent);
+    void GradUnsqueeze(XTensor * node, bool isEfficient);

 };


--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,104 +77,20 @@ backward propagation to obtain gradient
 >> root - root node (output) of the network
 >> loss - name of loss function
 */
-void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XTensor &root)
 {
    TensorList roots(1);
    roots.Add(&root);

-    TensorList golds(1);
-    golds.Add(NULL);
-
-    TensorList paddings(1);
-    paddings.Add(NULL);
-
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient wrt. the loss/error function
->> root - root node (output) of the network
->> gold - gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-
-    TensorList golds(1);
-    golds.Add(&gold);
-
-    TensorList paddings(1);
-    paddings.Add(NULL);
-
-    Backward(roots, golds, paddings, loss);
-}
-
-/* 
-backward propagation to obtain gradient wrt. the loss/error function 
->> root - root node (output) of the network
->> gold - gold standard for the output
->> padding - specify a target value that is ignored and does not contribute to the gradient computation
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-
-    TensorList golds(1);
-    golds.Add(&gold);
-
-    TensorList paddings(1);
-    paddings.Add(&padding);
-
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
-{
-    TensorList golds(roots.count);
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++) {
-        golds.Add(NULL);
-        paddings.Add(NULL);
-    }
-
-    Backward(roots, golds, paddings, loss);
-}
-
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
-{
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++)
-        paddings.Add(NULL);
-
-    Backward(roots, golds, paddings, loss);
+    Backward(roots);
 }

 /* 
 backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes 
 >> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> paddings - specify a target value that is ignored
->> loss - name of loss function
 */
-void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots)
 {
    Traverse(roots);

@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, 
        node->visitMark = NODE_UNFINISHED;
    }

-    //XLossGrad lossGrad;
-
-    /* we start with the gradient with respect to the loss for output layers */
-    /*for(int i = 0; i < roots.count; i++){
-        XTensor * root = (XTensor*)roots.Get(i);
-        XTensor * gold = (XTensor*)golds.Get(i);
-        XTensor * padding = (XTensor*)paddings.Get(i);
-        XLink &income = root->income;
-        int funcID = income.typeID;
-        void * params = income.params;*/
-
-        /* we compute dE/dx if the output is generated by an activation function y = f(x).
-           Note that we do not need to obtain dE/dy here because it is no use in the 
-           folloing process of back-propagation */
-        /*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
-                XTensor * x = income.tails[0];
-                XNoder::MakeGrad(x);
-                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
-                root->visitMark = NODE_FINISHED;
-            }
-            else {
-                XNoder::MakeGrad(root);
-                lossGrad.Compute(gold, root, root->grad, padding, loss);
-            }
-        }*/
-        /* we compuate dE/dy (y is the output) if no predefined activation function is used */
-        /*else{
-            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, NULL, loss);
-        }
-    }*/
-    
    /* back-propagation from output to input */
    for(int i = nodes.count - 1; i >= 0; i--){
        XTensor * node = (XTensor*)nodes.Get(i);
@@ -238,8 +121,13 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, 
                ClearGrad(parent);
            }

-            if(XNoder::IsLeaf(node))
+            if (XNoder::IsLeaf(node)) {
                ClearGrad(node);
+                if (node->outgo.tailNum == 0) {
+                    delete node;
+                }
+            }
+            
        }
    }
 }
@@ -267,7 +155,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        else if(XShapeGrad::IsShapeOP(node))
            XShapeGrad::MakeGrad(node, isEfficent);
        else if(XLossGrad::IsLossOP(node))
-			XLossGrad::MakeGrad(node, isEfficent);
+            XLossGrad::MakeGrad(node, isEfficent);
        else{
            ShowNTErrors("Wrong node type!");
        }
@@ -433,7 +321,6 @@ void XNet::ClearGrad(XTensor * node)
    }

    if(finished){
-        //fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
        delete node->grad;
        node->grad = NULL;
    }
@@ -451,7 +338,7 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)

    Traverse(roots);

-    XLink::ShowNode(file, node);
+    //XLink::ShowNode(file, node);

    /* go over nodes in its topological order */
    for(int i = nodes.count - 1; i >= 0; i--){
@@ -460,7 +347,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
    }
 }

-
 /*
 search for a node in a top-down manner by its name
 >> top - the top most node
@@ -468,7 +354,7 @@ search for a node in a top-down manner by its name
 */
 //XTensor * XNet::SearchNode(XTensor * top, const char * name)
 //{
-	//return XLink::SearchNode(top, name);
+    //return XLink::SearchNode(top, name);
 //}

 }
--- a/source/network/XNet.h
+++ b/source/network/XNet.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -61,25 +61,11 @@ struct XNet
    void Clear();

    /* backward propagation to obtain gradient */
-    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
-
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XTensor &root);

    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots);

    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);

--- a/source/network/XNoder.cpp
+++ b/source/network/XNoder.cpp
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
    if(node == NULL)
        return;

-    if(!XTensor::IsSameShaped(node, node->grad)){
+    if(!_IsSameShaped(node, node->grad)){
        delete node->grad;
        node->grad = NewTensor(node);
        node->grad->SetZeroAll();

--- a/source/network/XNoder.h
+++ b/source/network/XNoder.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
 */

-#include "../tensor/XTensor.h"
+#include "../tensor/core/CHeader.h"

 #ifndef __XNODER_H__
 #define __XNODER_H__

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
--- a/source/sample/fnnlm/FNNLM.h
+++ b/source/sample/fnnlm/FNNLM.h
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/Decoder.cpp
+++ b/source/sample/transformer/Decoder.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
+ */
+
+#include "Decoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace nmt
+{
+
+/*
+constructor
+every owned pointer starts out empty and the scalar members that are read
+outside of InitModel() get defined values, so an object on which InitModel()
+was never called can still be used and destroyed safely
+*/
+AttDecoder::AttDecoder()
+{
+    selfAtt = NULL;
+    fnns = NULL;
+    selfAttLayerNorms = NULL;
+    fnnLayerNorms = NULL;
+    enDeAtt = NULL;
+    enDeAttLayerNorms = NULL;
+    decoderLayerNorm = NULL;
+    selfAttCache = NULL;
+    enDeAttCache = NULL;
+
+    /* preNorm is read by the destructor and by Make(); nlayer and dropoutP
+       are read by Make()/MakeFast(). Leaving them indeterminate would make
+       those reads undefined behaviour before InitModel() has run. */
+    preNorm = false;
+    nlayer = 0;
+    dropoutP = 0;
+}
+
+/*
+de-constructor
+releases every per-layer array and the final layer normalization
+*/
+AttDecoder::~AttDecoder()
+{
+    delete[] selfAttCache;
+    delete[] enDeAttCache;
+    delete[] selfAtt;
+    delete[] fnns;
+    delete[] selfAttLayerNorms;
+    delete[] fnnLayerNorms;
+    delete[] enDeAtt;
+    delete[] enDeAttLayerNorms;
+
+    /* the pointer is NULL unless InitModel() allocated it, and deleting a
+       null pointer is a no-op, so no check of preNorm is needed here.
+       Not reading preNorm also avoids using it before it was ever set and
+       avoids a leak if the flag is changed after InitModel(). */
+    delete decoderLayerNorm;
+}
+
+/*
+initialize the model
+copies the decoder settings out of the configuration, allocates one
+attention / fnn / layer-norm / cache entry per layer and initializes them
+>> config - configurations of the model
+*/
+void AttDecoder::InitModel(Config& config)
+{
+    devID = config.devID;
+    nlayer = config.nDecLayer;
+    hSize = config.modelSize;
+    eSize = config.embSize;
+    vSize = config.tgtVocabSize;
+    dropoutP = config.dropout;
+    preNorm = config.preNorm;
+
+    CheckNTErrors(nlayer >= 1, "We have one decoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
+
+    /* embedding model */
+    embedder.InitModel(config, false);
+
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    selfAttLayerNorms = new LN[nlayer];
+    enDeAtt = new Attention[nlayer];
+    enDeAttLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];
+
+    selfAttCache = new Cache[nlayer];
+    enDeAttCache = new Cache[nlayer];
+
+    /* the final layer normalization only exists for pre-norm models */
+    if (preNorm)
+        decoderLayerNorm = new LN;
+
+    /* initialize the stacked layers */
+    for (int i = 0; i < nlayer; i++) {
+        selfAtt[i].InitModel(config);
+        fnns[i].InitModel(config);
+        selfAttLayerNorms[i].InitModel(config);
+        fnnLayerNorms[i].InitModel(config);
+        enDeAtt[i].InitModel(config);
+        enDeAttLayerNorms[i].InitModel(config);
+
+        /* both attention caches are switched on for every layer */
+        selfAttCache[i].enable = true;
+        enDeAttCache[i].enable = true;
+    }
+    if (preNorm)
+        decoderLayerNorm->InitModel(config);
+}
+
+/*
+make the decoding network
+each layer runs self-attention, encoder-decoder attention and the fnn in
+that order; every sub-layer is followed by optional dropout and a residual
+sum, and is wrapped by two LayerNorm() calls (one before, one after) that
+both receive the preNorm flag
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - mask that indicates which position is valid
+>> maskEncDec - mask for the encoder-decoder attention
+>> nstep - the current length of the decoder input
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the decoder
+*/
+XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                         XTensor* maskEncDec, int nstep, bool isTraining)
+{
+    XTensor x;
+
+    /* embed the decoder input; nstep is forwarded to the embedder */
+    x = embedder.Make(inputDec, true, isTraining, nstep);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor att;
+        XTensor ende;
+        XTensor fnn;
+        XTensor res;
+        XTensor selfAttnBefore;
+        XTensor selfAttnAfter;
+        XTensor endeAttnBefore;
+        XTensor endeAttnAfter;
+        XTensor fnnBefore;
+
+        /* layer normalization with pre-norm for self-attn
+           NOTE(review): the two trailing flags presumably mark the "before"
+           and "after" position of the call - confirm in module/LayerNorm.h */
+        selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);
+
+        /******************/
+        /* self attention (uses the per-layer self-attention cache) */
+        att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore, 
+                              mask, isTraining, &selfAttCache[i], SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            att = Dropout(att, dropoutP);
+
+        /* residual connection */
+        res = Sum(att, x);
+
+        /* layer normalization with post-norm for self-attention */
+        selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
+
+        /* layer normalization with pre-norm for encoder-decoder attention */
+        endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);
+
+        /* encoder-decoder attention: the encoder output is passed as the
+           first and third argument, the decoder state as the second
+           (uses the per-layer encoder-decoder cache) */
+        ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec, 
+                               isTraining, &enDeAttCache[i], EN_DE_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            ende = Dropout(ende, dropoutP);
+
+        /* residual connection */
+        res = Sum(ende, selfAttnAfter);
+
+        /* layer normalization with post-norm for encoder-decoder attention */
+        endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
+
+        /* layer normalization with pre-norm for fnn */
+        fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);
+
+        /* fnn */
+        fnn = fnns[i].Make(fnnBefore, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            fnn = Dropout(fnn, dropoutP);
+
+        /* residual connection */
+        res = Sum(fnn, endeAttnAfter);
+
+        /* layer normalization with post-norm for fnn;
+           the result is the input of the next layer */
+        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
+    }
+
+    /* pre-norm models normalize the output of the last layer once more */
+    if (preNorm)
+        return decoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the decoding network (pre-norm only)
+every sub-layer is normalized before it is applied and the output of the
+last layer is normalized by decoderLayerNorm, so the model must have been
+initialized with preNorm enabled
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - mask that indicates which position is valid
+>> maskEncDec - mask for the encoder-decoder attention
+>> nstep - the current length of the decoder input
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the decoder
+*/
+XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                             XTensor* maskEncDec, int nstep, bool isTraining)
+{
+    /* decoderLayerNorm is only allocated by InitModel() for pre-norm models;
+       fail with a clear message instead of dereferencing a null pointer */
+    CheckNTErrors(decoderLayerNorm != NULL,
+                  "MakeFast requires a model initialized with pre-norm!");
+
+    XTensor x;
+
+    x = embedder.Make(inputDec, true, isTraining, nstep);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+
+        /* keep the sub-layer input for the residual connection */
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = selfAttLayerNorms[i].Make(x);
+
+        /******************/
+        /* self attention */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for encoder-decoder attention */
+        x = enDeAttLayerNorms[i].Make(x);
+
+        /* encoder-decoder attention */
+        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
+                            isTraining, &enDeAttCache[i], EN_DE_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+
+    /* final layer normalization of the decoder output */
+    x = decoderLayerNorm->Make(x);
+
+    return x;
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,18 +16,17 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TDECODER_H__
-#define __T2TDECODER_H__
+#ifndef __DECODER_H__
+#define __DECODER_H__

-#include "T2TEncoder.h"
+#include "Encoder.h"
+#include "Utility.h"

-namespace transformer
+namespace nmt
 {
-    
-#define DECODING_NAME "decoding"
-#define DECODING_INPUT_NAME "decoding_input"

 class AttDecoder
 {
@@ -37,9 +35,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* layer number */
    int nlayer;

@@ -55,50 +50,56 @@ public:
    /* dropout probability */
    DTYPE dropoutP;

-    /* some positions can be ignored in attention. this is useful in lm where the first position needs
- *     special design for the attention model. */
-    int ignored;
-
    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN * fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention * attentions;
-
-    /* layer normalization for fnn */
-    T2TLN * fnnLayerNorms;
+    Attention* selfAtt;

    /* layer normalization for attention */
-    T2TLN * attLayerNorms;
+    LN* selfAttLayerNorms;

-    /* input tensor of the encoder */
-    XTensor * input;
+    /* layer normalization for fnn */
+    LN* fnnLayerNorms;

-    /* output tensor of the encoder */
-    XTensor * output;
+    /* layer normalization for decoder */
+    LN* decoderLayerNorm;

    /* encoder-decoder attention model of each layer */
-    T2TAttention * attentionsEnde;
+    Attention* enDeAtt;

    /* layer normalization for encoder-decoder attention */
-    T2TLN * attEndeLayerNorms;
+    LN* enDeAttLayerNorms;
+
+    /* layer cache list */
+    Cache* selfAttCache;
+
+    /* layer cache list */
+    Cache* enDeAttCache;
+
+    /* the location of layer normalization */
+    bool preNorm;
+
 public:
    /* constructor */
    AttDecoder();

-    /* deconstructor */
+    /* de-constructor */
    ~AttDecoder();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, 
-                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(Config& config);

    /* make the decoding network */
-    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
+    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                 XTensor* maskEncDec, int nstep, bool isTraining);
+
+    /* make the decoding network (pre norm) */
+    XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                     XTensor* maskEncDec, int nstep, bool isTraining);
 };

 }

--- a/source/sample/transformer/Encoder.cpp
+++ b/source/sample/transformer/Encoder.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
+ */
+
+#include "Encoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
+#include "../../tensor/core/CHeader.h"
+
+namespace nmt
+{
+
+/*
+constructor
+every owned pointer starts out empty and the scalar members that are read
+outside of InitModel() get defined values, so an object on which InitModel()
+was never called can still be used and destroyed safely
+*/
+AttEncoder::AttEncoder()
+{
+    selfAtt = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
+    encoderLayerNorm = NULL;
+
+    /* preNorm is read by the destructor and by Make(); nlayer and dropoutP
+       are read by Make()/MakeFast(). Leaving them indeterminate would make
+       those reads undefined behaviour before InitModel() has run. */
+    preNorm = false;
+    nlayer = 0;
+    dropoutP = 0;
+}
+
+/*
+de-constructor
+releases every per-layer array and the final layer normalization
+*/
+AttEncoder::~AttEncoder()
+{
+    delete[] selfAtt;
+    delete[] fnns;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
+
+    /* the pointer is NULL unless InitModel() allocated it, and deleting a
+       null pointer is a no-op, so no check of preNorm is needed here.
+       Not reading preNorm also avoids using it before it was ever set and
+       avoids a leak if the flag is changed after InitModel(). */
+    delete encoderLayerNorm;
+}
+
+/*
+set up the encoder from a configuration
+the encoder settings are copied out of the configuration, then the embedder,
+the per-layer modules and (for pre-norm models) the final layer
+normalization are created and initialized
+>> config - configurations for the model
+*/
+void AttEncoder::InitModel(Config& config)
+{
+    /* settings taken from the configuration */
+    devID = config.devID;
+    nlayer = config.nEncLayer;
+    eSize = config.embSize;
+    hSize = config.modelSize;
+    vSize = config.srcVocabSize;
+    preNorm = config.preNorm;
+    dropoutP = config.dropout;
+
+    /* sanity checks on the settings */
+    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
+    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");
+
+    /* the embedder comes first */
+    embedder.InitModel(config);
+
+    /* one module of each kind per layer */
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    attLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];
+
+    /* the final layer normalization only exists for pre-norm models */
+    if (preNorm) {
+        encoderLayerNorm = new LN;
+    }
+
+    /* initialize the layers from bottom to top */
+    for (int layer = 0; layer < nlayer; ++layer) {
+        selfAtt[layer].InitModel(config);
+        fnns[layer].InitModel(config);
+        attLayerNorms[layer].InitModel(config);
+        fnnLayerNorms[layer].InitModel(config);
+    }
+
+    /* the final layer normalization is initialized last */
+    if (preNorm) {
+        encoderLayerNorm->InitModel(config);
+    }
+}
+
+/*
+make the encoding network
+each layer runs self-attention and then the fnn; every sub-layer is followed
+by optional dropout and a residual sum, and is wrapped by two LayerNorm()
+calls (one before, one after) that both receive the preNorm flag
+>> input - the input tensor of the encoder
+>> mask - the mask that indicates which positions are valid
+>> maskEncDec - not used by this function
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the encoder
+*/
+XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
+{
+    XTensor x;
+
+    /* embed the encoder input */
+    x = embedder.Make(input, false, isTraining);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor att;
+        XTensor fnn;
+        XTensor res;
+        XTensor attnBefore;
+        XTensor attnAfter;
+        XTensor fnnBefore;
+
+        /* layer normalization with pre-norm for self-attn
+           NOTE(review): the two trailing flags presumably mark the "before"
+           and "after" position of the call - confirm in module/LayerNorm.h */
+        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
+
+        /* self attention (no cache is passed on the encoder side) */
+        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            att = Dropout(att, dropoutP);
+
+        /* residual connection */
+        res = Sum(att, x);
+
+        /* layer normalization with post-norm for self-attn */
+        attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
+
+        /* layer normalization with pre-norm for fnn */
+        fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);
+
+        /* fnn */
+        fnn = fnns[i].Make(fnnBefore, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            fnn = Dropout(fnn, dropoutP);
+
+        /* residual connection */
+        res = Sum(fnn, attnAfter);
+
+        /* layer normalization with post-norm for fnn;
+           the result is the input of the next layer */
+        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
+    }
+    /* pre-norm models normalize the output of the last layer once more */
+    if (preNorm)
+        return encoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the encoding network (pre-norm only)
+every sub-layer is normalized before it is applied and the output of the
+last layer is normalized by encoderLayerNorm, so the model must have been
+initialized with preNorm enabled
+>> input - the input tensor of the encoder
+>> mask - the mask that indicates which positions are valid
+>> maskEncDec - not used by this function
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the encoder
+*/
+XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
+{
+    /* encoderLayerNorm is only allocated by InitModel() for pre-norm models;
+       fail with a clear message instead of dereferencing a null pointer */
+    CheckNTErrors(encoderLayerNorm != NULL,
+                  "MakeFast requires a model initialized with pre-norm!");
+
+    XTensor x;
+
+    x = embedder.Make(input, false, isTraining);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+
+        /* keep the sub-layer input for the residual connection */
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = attLayerNorms[i].Make(x);
+
+        /* self attention */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+
+    /* final layer normalization of the encoder output */
+    x = encoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+convenience overload of the encoding network
+forwards to the four-argument Make() with an empty tensor in place of the
+encoder-decoder mask
+>> input - the input tensor of the encoder
+>> mask - the mask that indicates which positions are valid
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the encoder
+*/
+XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
+{
+    /* placeholder argument for the encoder-decoder mask */
+    XTensor emptyMaskEncDec;
+
+    return Make(input, mask, emptyMaskEncDec, isTraining);
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,57 +16,42 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TENCODER_H__
-#define __T2TENCODER_H__
+#ifndef __ENCODER_H__
+#define __ENCODER_H__

-#include "T2TFNN.h"
-#include "T2TAttention.h"
-#include "T2TEmbedding.h"
-#include "T2TLayerNormal.h"
+#include "Utility.h"
+#include "module/FNN.h"
+#include "module/Attention.h"
+#include "module/Embedding.h"
+#include "module/LayerNorm.h"
 #include "../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {
-    
-#define ENCODING_NAME "encoding"
-#define ENCODING_INPUT_NAME "encoding_input"

-/* 
-base class of the encoder 
-*/
-class T2TEncoder
-{
-public:
-    virtual
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
-};
-
-/* 
-the encoder based on RNN 
+/*
+base class of the encoder
 */
-class RNNEncoder : T2TEncoder
+class Encoder
 {
 public:
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
+    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
 };

-
-/* 
-the encoder based on self-attention 
+/*
+the encoder based on self-attention
 */
-class AttEncoder : T2TEncoder
+class AttEncoder : Encoder
 {
 public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* layer number */
    int nlayer;

@@ -88,26 +72,26 @@ public:
    int ignored;

    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN * fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention * attentions;
+    Attention* selfAtt;
+
+    /* layer normalizations for attention */
+    LN* attLayerNorms;

    /* layer normalization for fnn */
-    T2TLN * fnnLayerNorms;
+    LN* fnnLayerNorms;

-    /* layer normalization for attention */
-    T2TLN * attLayerNorms;
+    /* layer normalization for encoder */
+    LN* encoderLayerNorm;

-    /* input tensor of the encoder */
-    XTensor * input;
+    /* the location of layer normalization */
+    bool preNorm;

-    /* output tensor of the encoder */
-    XTensor * output;
-    
 public:
    /* constructor */
    AttEncoder();
@@ -116,18 +100,18 @@ public:
    ~AttEncoder();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, 
-                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(Config& config);

    /* make the encoding network */
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
+    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
+
+    /* make the encoding network */
+    XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);

    /* make the encoding network (wrapper) */
-    XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
 };

-
 }

 #endif
--- a/source/sample/transformer/Model.cpp
+++ b/source/sample/transformer/Model.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,40 +16,38 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TMODEL_H__
-#define __T2TMODEL_H__
+#ifndef __MODEL_H__
+#define __MODEL_H__

-#include "T2TFNN.h"
-#include "T2TAttention.h"
-#include "T2TEncoder.h"
-#include "T2TDecoder.h"
-#include "T2TOutput.h"
+#include "Encoder.h"
+#include "Decoder.h"
+#include "module/FNN.h"
+#include "module/Output.h"
+#include "Utility.h"
+#include "module/Attention.h"

-namespace transformer
+namespace nmt
 {

-/* a transformer model that keeps parameters of the encoder,
-   the decoder and the output layer (softmax). Also, it creates
-   the network used in transformer. */
-class T2TModel
+/* a nmt model that keeps parameters of the encoder,
+   the decoder and the output layer (softmax). */
+class Model
 {
 public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* the encoder */
-    AttEncoder * encoder;
+    AttEncoder* encoder;

    /* the decoder */
-    AttDecoder * decoder;
+    AttDecoder* decoder;

    /* output layer */
-    T2TOutput * outputLayer;
+    Output* outputLayer;

    /* indicates whether the model is running for language modeling */
    bool isLM;
@@ -58,53 +55,65 @@ public:
    /* indicates whether the model is running for machine translation */
    bool isMT;

+    /* indicates whether the model is running with FP16 data type */
+    bool useFP16;
+
    /* number of heads in the attention model */
    int nhead;

+    /* indicates whether share encoders embeddings with decoders */
+    int shareAllEmbeddings;
+
+    /* indicates whether share decoder embeddings with output weights */
+    int shareDecInputOutputWeight;
+
 public:
    /* constructor */
-    T2TModel();
+    Model();

    /* de-constructor */
-    ~T2TModel();
+    ~Model();

    /* initialize the model */
-    void InitModel(int argc, char ** argv);
+    void InitModel(Config& config);
+
+    /* print model configurations */
+    void ShowModelConfig(Config& config);

    /* make the encoding network */
-    XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);

    /* make the encoding network */
-    XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining);
+    XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
+        XTensor& MaskEncDec, bool isTraining);

-    /* make the network for langauge modeling (with the output softmax layer) */
-    void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
+    /* make the network for language modeling (with the output softmax layer) */
+    void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);

    /* make the network for machine translation (with the output softmax layer) */
-    void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, 
-                XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
+    void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
+        XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);

    /* make the mask for training MT models */
-    void MakeMTMask(XTensor &inputEnc, XTensor &inputDec, 
-                    XTensor &paddingEnc, XTensor &paddingDec, 
-                    XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
-    
+    void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
+        XTensor& paddingEnc, XTensor& paddingDec,
+        XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
+
    /* make the mask of the encoder */
-    void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
-    
+    void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
+
    /* make the mask of the decoder */
-    void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
-                       XTensor &paddingEnc, XTensor &paddingDec,
-                       XTensor &maskDec, XTensor &maskEncDec);
+    void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
+        XTensor& maskDec, XTensor& maskEncDec);

-    /* get parameter matrics */
-    void GetParams(TensorList &list);
+    /* get parameter matrices */
+    void GetParams(TensorList& list);

-    /* dump the parameters */
-    void Dump(const char * fn);
+    /* dump the model to a file */
+    void Dump(const char* fn);

    /* read the parameters */
-    void Read(const char * fn);
+    void Read(FILE* file);
 };

 }

--- a/source/sample/transformer/NMT.cpp
+++ b/source/sample/transformer/NMT.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
+ */
+
+#include <ctime>
+
+#include "NMT.h"
+#include "train/Trainer.h"
+#include "translate/Translator.h"
+
+namespace nmt
+{
+
+/*
+entrance of the NMT system
+runs training when a training file is configured and translation when both
+a test file and an output file are configured
+>> argc - number of arguments
+>> argv - the arguments
+<< return - 1 if no argument is given, 0 otherwise
+*/
+int NMTMain(int argc, const char** argv)
+{
+    /* nothing to run without arguments */
+    if (argc == 0) {
+        return 1;
+    }
+
+    /* parse the arguments into the configuration */
+    Config config(argc, argv);
+
+    /* fixed random seed */
+    srand(1);
+
+    /* training is requested by a non-empty training file name */
+    const bool needTraining = (strcmp(config.trainFN, "") != 0);
+    if (needTraining) {
+        Model model;
+        model.InitModel(config);
+
+        Trainer trainer;
+        trainer.Init(config);
+        trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
+    }
+
+    /* translation needs both a test file and an output file */
+    const bool needTranslation = (strcmp(config.testFN, "") != 0)
+                                 && (strcmp(config.outputFN, "") != 0);
+    if (needTranslation) {
+        /* disable grad flow */
+        DISABLE_GRAD;
+
+        Model model;
+        model.InitModel(config);
+
+        Translator translator;
+        translator.Init(config);
+        translator.Translate(config.testFN, config.srcVocabFN,
+                             config.tgtVocabFN, config.outputFN, &model);
+    }
+
+    return 0;
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/Transformer.h
+++ b/source/sample/transformer/Transformer.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,29 +15,17 @@
 */

 /*
- *
- * An impelementation of the transformer system. See more details 
- * about FNNLM in 
- * "Attention Is All You Need" by Vaswani et al.
- * https://arxiv.org/pdf/1706.03762.pdf
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- * I start writing the code related to NMT - a long time since my last coding 
- * work on MT
+ * An implementation of the NMT system. 
 */

-#ifndef __TRANSFORMER_H__
-#define __TRANSFORMER_H__
-
-#include "../../tensor/XGlobal.h"
-#include "../../tensor/XTensor.h"
-#include "../../tensor/core/CHeader.h"
+#ifndef __NMT_H__
+#define __NMT_H__

-namespace transformer
+namespace nmt
 {

 /* entrance of the program */
-int TransformerMain(int argc, const char ** argv);
+int NMTMain(int argc, const char** argv);

 }


--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include "T2TAttention.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-T2TAttention::T2TAttention()
-{
-    nhead = -1;
-    dk = -1;
-    dv = -1;
-    d  = -1;
-    isMasked = false;
-    ignored = 0;
-}
-
-/* deconstructor */
-T2TAttention::~T2TAttention()
-{
-}
-
-/* 
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myIgnored - number of position ignored in attention (from the begining)
->> myIsMasked - indicates whether the attention is with a mask
->> myDevID - device id
->> myMem - the memory pool
-*/
-void T2TAttention::InitModel(int argc, char ** argv, 
-                             bool myIsMasked, int myIgnored, 
-                             int myDevID, XMem * myMem)
-{
-    devID = myDevID;
-    mem = myMem;
-    isMasked = myIsMasked;
-    ignored = myIgnored;
-    
-    float minmax = 0;
-
-    LoadParamInt(argc, argv, "nhead", &nhead, 8);
-    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
-    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
-
-    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
-    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
-    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
-
-    float scale = 1.0F;
-    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
-    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
-    float finfouta = (float)sqrt(6.0F * scale / (d + d));
-    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
-
-    wk.SetDataRand(-finfoutk, finfoutk);
-    wq.SetDataRand(-finfoutk, finfoutk);
-    wv.SetDataRand(-finfoutv, finfoutv);
-    wa.SetDataRand(-finfouta, finfouta);
-    wbig.SetDataRand(-finfoutbig, finfoutbig);
-}
-
-/* 
-make the network 
->> k - keys. It might be of size B * L * H
-       where B = batch size, L = sequence length, 
-       and H = vector size of each position
->> q - queries
->> v - values
->> mask - as it is
->> isTraining - indicates whether the model is used for training
-<< return - multi-attention result
-*/
-XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
-{
-    XTensor k2;
-    XTensor q2;
-    XTensor v2;
-    
-    /* linear transformation before self-attention */
-    k2 = MMul(k, wk);
-    q2 = MMul(q, wq);
-    v2 = MMul(v, wv);
-    
-    return MakeAttention(k2, q2, v2, mask, isTraining);
-}
-    
-/*
-make the network given a big tensor that keeps keys, queries and values
->> kqv - the big tensor
->> mask - as it is
->> isTraining - indicates whether the model is used for training
-*/
-XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
-{
-    XTensor k2;
-    XTensor q2;
-    XTensor v2;
-    XTensor kqv2;
-    TensorList split;
-    
-    kqv2 = MMul(kqv, wbig);
-    
-    int d1 = kqv2.GetDim(0);
-    int d2 = kqv2.GetDim(1);
-    int d3 = kqv2.GetDim(2) / 3;
-    
-    InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
-    InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
-    InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
-    
-    split.Add(&q2);
-    split.Add(&k2);
-    split.Add(&v2);
-    
-    Split(kqv2, split, 2, 3);
-    
-    return MakeAttention(k2, q2, v2, mask, isTraining);
-}
-    
-/*
-make the attention network given keys, queries and values (after linear transformation)
->> k - keys. It might be of size B * L * H
-       where B = batch size, L = sequence length,
-       and H = vector size of each position
->> q - queries
->> v - values
->> mask - as it is
->> isTraining - indicates whether the model is used for training
-*/
-XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
-{
-    XTensor kheads;
-    XTensor qheads;
-    XTensor vheads;
-    
-    /* multi head */
-    kheads = Split(k, k.order - 1, nhead);
-    qheads = Split(q, q.order - 1, nhead);
-    vheads = Split(v, v.order - 1, nhead);
-    
-    XTensor att;
-    XTensor dot;
-    XTensor scalar;
-    
-    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
-    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
-    
-    if(isMasked)
-        dot = dot + mask;
-    
-    dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
-    
-    scalar = Softmax(dot, -1);
-
-    if(isTraining && dropoutP > 0)
-        scalar = Dropout(scalar, dropoutP);
-    
-    att = BMMul(scalar, vheads);
-    
-    /* concatenate the heads */
-    return MMul(Merge(att, att.order - 1), wa);
-}
-
-}
--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TATTENTION_H__
-#define __T2TATTENTION_H__
-
-#include "../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* 
-multi-head attention 
-y(Q, K, V) = cat(head_1, head_2, ..., head_n)
-where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
-      attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
-      d_k = dimension size of K
-*/
-class T2TAttention
-{
-public:
-    /* device id */
-    int devID;
-    
-    /* memory pool */
-    XMem * mem;
-    
-    /* head number */
-    int nhead;
-
-    /* transformation matrix for K */
-    XTensor wk;
-
-    /* transformation matrix for Q */
-    XTensor wq;
-
-    /* transformation matrix for V */
-    XTensor wv;
-
-    /* transformation after dot-product attention */
-    XTensor wa;
-    
-    XTensor wbig;
-	
-    /* size of transformed Q and K */
-    int dk;
-
-    /* size of transformed V */
-    int dv;
-
-    /* size of input Q, K and V */
-    int d;
-
-    /* indicates whether the attention is masked */
-    bool isMasked;
-
-    /* some positions can be ignored in attention. this is useful in lm where the first position needs
-       special design for the attention model. */
-    int ignored;
-
-    /* indicates whether the model is used for training */
-    bool isTraining;
-    
-    /* dropout probability */
-    DTYPE dropoutP;
-
-public:
-    /* constructor */
-    T2TAttention();
-
-    /* de-constructor */
-    ~T2TAttention();
-
-    /* initialize the model */
-    void InitModel(int argc, char ** argv, 
-                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
-
-    /* make the network */
-    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
-    
-    /* make the network given a big tensor that keeps keys, queries and values */
-    XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
-    
-    /* make the attention network given keys, queries and values (after linear transformation) */
-    XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
- * it is cold today but i'll move to a warm place tomorrow :)
- */
-
-#ifndef __T2TBATCHLOADER_H__
-#define __T2TBATCHLOADER_H__
-
-#include "../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-#define MAX_SEQUENCE_LENGTH 1024 * 16
-
-/* node to keep batch information */
-struct BatchNode
-{
-    /* begining position */
-    int beg;
-
-    /* end position */
-    int end;
-
-    /* maximum word number on the encoder side */
-    int maxEnc;
-
-    /* maximum word number on the decoder side */
-    int maxDec;
-
-    /* a key for sorting */
-    int key;
-};
-
-class T2TBatchLoader
-{
-public:
-    /* buffer for loading words */
-    int * buf;
-
-    /* another buffer */
-    int * buf2;
-
-    /* batch buf */
-    BatchNode * bufBatch;
-
-    /* buffer size */
-    int bufSize;
-
-    /* size of batch buffer */
-    int bufBatchSize;
-
-    /* length of each sequence */
-    int * seqLen;
-
-    /* another array */
-    int * seqLen2;
-
-    /* offset of the first word for each sequence */
-    int * seqOffset;
-
-    /* number of sequences in the buffer */
-    int nseqBuf;
-
-    /* offset for next sequence in the buffer */
-    int nextSeq;
-
-    /* offset for next batch */
-    int nextBatch;
-
-    /* indicates whether we double the </s> symbol for the output of lms */
-    bool isDoubledEnd;
-    
-    /* indicates whether we use batchsize = max * sc
-       rather rather than batchsize = word-number, where max is the maximum
-       length and sc is the sentence number */
-    bool isSmallBatch;
-
-    /* counterpart of "isSmallBatch" */
-    bool isBigBatch;
-
-    /* randomize batches */
-    bool isRandomBatch;
-
-    /* bucket size */
-    int bucketSize;
-
-public:
-    /* constructor */
-    T2TBatchLoader();
-
-    /* de-constructor */
-    ~T2TBatchLoader();
-
-    /* initialization */
-    void Init(int argc, char ** argv);
-
-    /* load data to buffer */
-    int LoadBuf(FILE * file, bool isSorted, int step);
-
-    /* clear data buffer */
-    void ClearBuf();
-
-    /* set the random batch flag */
-    void SetRandomBatch(bool flag = true);
-
-    /* load a batch of sequences */
-    int LoadBatch(FILE * file, bool isLM,
-                  XTensor * batchEnc, XTensor * paddingEnc, 
-                  XTensor * batchDec, XTensor * paddingDec,
-                  XTensor * gold, XTensor * label,
-                  int * seqs,
-                  int vsEnc, int vsDec, int sBatch, int wBatch, 
-                  bool isSorted, int &ws, int &wCount,
-                  int devID, XMem * mem, 
-				  bool isTraining);
-
-    /* load a batch of sequences (for language modeling) */
-    int LoadBatchLM(FILE * file, 
-                    XTensor * batchEnc, XTensor * paddingEnc,
-                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold, XTensor * label,
-                    int * seqs, int vs, int sBatch, int wBatch, 
-                    bool isSorted, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
-
-    /* load a batch of sequences (for machine translation) */
-    int LoadBatchMT(FILE * file, 
-                    XTensor * batchEnc, XTensor * paddingEnc, 
-                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold, XTensor * label,
-                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
-                    bool isSorted, int &ws, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
-
-    /* shuffle the data file */
-    void Shuffle(const char * srcFile, const char * tgtFile);
-};
-}
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
- */
-
-#include <math.h>
-#include "T2TDecoder.h"
-#include "T2TUtility.h"
-#include "T2TLayerNormal.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-AttDecoder::AttDecoder()
-{
-    attentions = NULL;
-    fnns = NULL;
-    attLayerNorms = NULL;
-    fnnLayerNorms = NULL;
-    attentionsEnde = NULL;
-    attEndeLayerNorms = NULL;
-}
-
-/* de-constructor */
-AttDecoder::~AttDecoder()
-{
-    delete[] attentions;
-    delete[] fnns;
-    delete[] attLayerNorms;
-    delete[] fnnLayerNorms;
-    delete[] attentionsEnde;
-    delete[] attEndeLayerNorms;
-}
-
-/* 
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myIsMasked - indicates whether the masked attention is employed
->> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
->> myMem - the memory pool
-*/
-void AttDecoder::InitModel(int argc, char ** argv, 
-                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
-{
-    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-
-    devID = myDevID;
-    mem = myMem;
-    ignored = myIgnored;
-
-    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
-    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
-
-    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
-
-    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem, false);
-
-    attentions = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
-    attentionsEnde = new T2TAttention[nlayer];
-    attEndeLayerNorms = new T2TLN[nlayer];
-
-    /* initialize the stacked layers */
-    for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
-        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-    }
-}
-
-/* 
-make the decoding network
->> inputDec - the input tensor of the decoder
->> outputEnc - the output tensor of the encoder
->> mask - mask that indicates which position is valid
->> maskEncDec - mask for the encoder-decoder attention
->> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the encoder
-*/
-XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
-{
-    XTensor x;
-
-    x = embedder.Make(inputDec);
-
-    /* dropout */
-    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
-
-    for(int i = 0; i < nlayer; i++){
-        XTensor att;
-        XTensor ende;
-        XTensor ln;
-        XTensor fnn;
-        XTensor res;
-
-        /******************/
-        /* self attention */
-        att = attentions[i].MakeBig(x, mask, isTraining);
-
-        /* dropout */
-        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
-
-        /* residual connection */
-        res = Sum(att, x);
-
-        /* layer normalization */
-        x = attLayerNorms[i].Make(res);
-
-        /*****************************/
-        /* encoder-decoder attention */
-        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
-
-        /* dropout */
-        if(isTraining && dropoutP > 0)
-            ende = Dropout(ende, dropoutP);
-
-        /* residual connection */
-        res = Sum(ende, x);
-
-        /* layer normalization */
-        x = attEndeLayerNorms[i].Make(res);
-
-        /*******/
-        /* fnn */
-        fnn = fnns[i].Make(x, isTraining);
-
-        /* dropout */
-        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
-
-        /* residual connection */
-        res = Sum(fnn, x);
-
-        /* layer normalization */
-        x = fnnLayerNorms[i].Make(res);
-    }
-    
-    x.SetName(DECODING_NAME);
-
-    return x;
-}
-
-
-}
--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
- */
-
-#include <math.h>
-#include "T2TEmbedding.h"
-#include "T2TUtility.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-T2TEmbedder::T2TEmbedder()
-{
-    devID = -1;
-    mem = NULL;
-    vSize = -1;
-    maxLength = -1;
-}
-
-/* deconstructor */
-T2TEmbedder::~T2TEmbedder()
-{
-}
-
-/* 
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
->> myMem - the memory pool
-*/
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
-{
-    devID = myDevID;
-    mem = myMem;
-    
-    if(isEnc){
-        LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    }
-    else{
-        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    }
-    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
-    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-
-    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
-
-    DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRandn(0, v);
-
-    /* create the positional embedding matrix */
-    MakePosEmbedding(eSize, d, maxLength);
-}
-
-/* 
-make positional embeddings (of size eSize * length)
->> eSize - embedding size
->> d - dimension size of the hidden layers
->> length - length of the sequence
-*/
-void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
-{
-    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
-
-    float * data = new float[posEmbeddingBase.unitNum];
-
-    for(int pos = 0; pos < length; pos++){
-        float * dp = data + pos * eSize;
-        
-        int channelSize = eSize / 2;
-        int offset = 0;
-        for(int i = 0; i < channelSize; i++){
-            dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
-        }
-        for(int i = 0; i < channelSize; i++){
-            dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
-        }
-
-        /*
-        for(int k = 0; k < eSize; k++){
-            if(k % 2 == 0){
-                int i = k/2;
-                dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
-            }
-            else{
-                int i = (k - 1)/2;
-                dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
-            }
-        }
-        */
-    }
-
-    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
-
-    delete[] data;
-}
-
-/* 
-make the network 
-*/
-XTensor T2TEmbedder::Make(XTensor &input)
-{
-    //CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
-    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
-    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
-    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
-
-    int dims[MAX_TENSOR_DIM_NUM];
-    memcpy(dims, input.dimSize, input.order * sizeof(int));
-    dims[input.order] = eSize;
-
-    XTensor wordEmbedding;
-    XTensor posEmbedding;
-
-    bool match = (posEmbedding.order == input.order);
-    if(match){
-        for(int i = 0; i < input.order; i++){
-            if(dims[i] != posEmbedding.GetDim(i))
-                match = false;
-        }
-    }
-
-    /* we make positional embeddings first */
-    //if(!match){
-    if(true){
-        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
-
-        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
-
-        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
-        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
-
-        DelTensorBuf(posTMP);
-    }
-
-    /* then we make word embeddings */
-    wordEmbedding = Gather(w, input);
-    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
-
-    /* we sum over the two embeddings */
-    return wordEmbedding + posEmbedding;
-}
-
-}
--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include "T2TEncoder.h"
-#include "T2TLayerNormal.h"
-#include "T2TUtility.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-AttEncoder::AttEncoder()
-{
-    attentions = NULL;
-    fnns = NULL;
-    attLayerNorms = NULL;
-    fnnLayerNorms = NULL;
-}
-
-/* de-constructor */
-AttEncoder::~AttEncoder()
-{
-    delete[] attentions;
-    delete[] fnns;
-    delete[] attLayerNorms;
-    delete[] fnnLayerNorms;
-}
-
-/* 
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myIsMasked - indicates whether the masked attention is employed
->> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
->> myMem - the memory pool
-*/
-void AttEncoder::InitModel(int argc, char ** argv, 
-                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
-{
-    devID = myDevID;
-    mem = myMem;
-    ignored = myIgnored;
-    
-    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
-    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
-
-    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
-
-    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem);
-
-    attentions = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
-
-    /* initialize the stacked layers */
-    for(int i = 0; i < nlayer; i++){
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-    }
-}
-
-/* 
-make the encoding network
->> input - the input tensor of the encoder
->> mask - the mask that indicate each position is valid
->> maskEncDec - no use
->> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the encoder
-*/
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
-{
-    XTensor x;
-
-    x = embedder.Make(input);
-
-    /* dropout */
-    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
-
-    for(int i = 0; i < nlayer; i++){
-        XTensor att;
-        XTensor ln;
-        XTensor fnn;
-        XTensor res;
-
-        /* self attention */
-        att = attentions[i].MakeBig(x, mask, isTraining);
-        
-        /* dropout */
-        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
-
-        /* residual connection */
-        res = Sum(att, x);
-
-        /* layer normalization */
-        x = attLayerNorms[i].Make(res);
-
-        /* fnn */
-        fnn = fnns[i].Make(x, isTraining);
-
-        /* dropout */
-        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
-
-        /* residual connection */
-        res = Sum(fnn, x);
-
-        /* layer normalization */
-        x = fnnLayerNorms[i].Make(res);
-    }
-    
-    x.SetName(ENCODING_NAME);
-    input.SetName(ENCODING_INPUT_NAME);
-
-    return x;
-}
-
-/*
-make the encoding network (wrapper) 
->> input - the input tensor of the encoder
->> mask - the mask that indicate each position is valid
->> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the encoder
-*/
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
-{
-    XTensor nothing;
-
-    return Make(input, mask, nothing, isTraining);
-}
-
-}
--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include "T2TOutput.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-/* constructor */
-T2TOutput::T2TOutput()
-{
-    devID = -1;
-    mem = NULL;
-    vSize = -1;
-    inSize = -1;
-    hSize = -1;
-}
-
-/* de-constructor */
-T2TOutput::~T2TOutput()
-{
-}
-
-/*
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
->> myMem - the memory pool
-*/
-void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
-{
-    devID = myDevID;
-    mem = myMem;
-
-    float minmax = 0;
-
-    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
-
-    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
-    
-    float scale = 1.0F;
-    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
-    w.SetDataRand(-finfout, finfout);
-
-    DTYPE v = 1.0F/(float)sqrt((float)hSize);
-    w.SetDataRandn(0, v);
-}
-
-/* 
-make the network 
-y = softmax(x * w)
->> input - input tensor
-<< return - output tensor 
-*/
-XTensor T2TOutput::Make(XTensor &input)
-{
-    XTensor &x = input;
-
-    return LogSoftmax(MMul(x, w), -1);
-}
-
-/* 
-make the network (redefined output tensor) 
->> input - input tensor
->> output - output tensor 
-*/
-void T2TOutput::Make(XTensor &input, XTensor &output)
-{
-    XTensor &x = input;
-
-    //output = LogSoftmax(MMul(x, w), -1);
-    output = Softmax(MMul(x, w), -1);
-    output.SetName(OUTPUT_NAME);
-}
-
-}
--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
- */
-
-#include <math.h>
-#include "T2TUtility.h"
-#include "T2TTester.h"
-#include "T2TSearch.h"
-#include "../../tensor/XUtility.h"
-#include "../../tensor/core/CHeader.h"
-#include "../../network/XNoder.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* constructor */
-T2TTester::T2TTester()
-{
-}
-
-/* de-constructor */
-T2TTester::~T2TTester()
-{
-}
-
-/* initialize the model */
-void T2TTester::Init(int argc, char ** argv)
-{
-    LoadParamInt(argc, argv, "vsize", &vSize, 1);
-    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
-
-    batchLoader.Init(argc, argv);
-    seacher.Init(argc, argv);
-}
-
-/* 
-test the model
->> fn - test data file
->> ofn - output data file
->> model - model that is trained
-*/
-void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
-{
-    int wc = 0;
-    int ws = 0;
-    int wordCount = 0;
-    int wordCountTotal = 0;
-    int sentCount = 0;
-    int batchCount = 0;
-    float loss = 0;
-
-    /* data files */
-    FILE * file = fopen(fn, "rb");
-    CheckNTErrors(file, "Cannot read the test file");
-    FILE * ofile = fopen(ofn, "wb");
-    CheckNTErrors(ofile, "Cannot open the output file");
-
-    int devID = model->devID;
-    XMem * mem = model->mem;
-
-    XNet net;
-    
-    double startT = GetClockSec();
-        
-    wordCount = 0;
-        
-    /* batch of input sequences */
-    XTensor batchEnc;
-    XTensor batchDec;
-
-    /* label */
-    XTensor label;
-
-    /* padding */
-    XTensor paddingEnc;
-    XTensor paddingDec;
-
-    /* gold standard */
-    XTensor gold;
-
-    /* an array that keeps the sequences */
-    int * seqs = new int[MILLION];
-
-    batchLoader.SetRandomBatch(false);
-    batchLoader.ClearBuf();
-
-    while(batchLoader.LoadBatch(file, model->isLM, 
-                                &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
-                                seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, mem, false))
-    {
-        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
-        CheckNTErrors(!model->isLM, "Only MT model is supported!");
-        
-        XTensor output;
-        XTensor score;
-
-        seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
-
-        Dump(ofile, &output);
-
-        float prob = 0;
-            
-        loss += -prob;
-        wc = batchEnc.GetDim(-1);
-        wordCount += wc;
-        wordCountTotal += wc;
-        sentCount += batchEnc.GetDim(-2);
-        batchCount += 1;
-
-        if (batchCount % 1 == 0) {
-            double elapsed = GetClockSec() - startT;
-            XPRINT3(0, stderr, 
-                   "[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
-                    elapsed, sentCount, wordCount);
-        }
-    }
-        
-    fclose(file);
-    fclose(ofile);
-
-    delete[] seqs;
-    
-    double elapsed = GetClockSec() - startT;
-
-    XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
-            elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
-}
-
-/*
-dump the result into the file
->> file - data file
->> output - output tensor
-*/
-void T2TTester::Dump(FILE * file, XTensor * output)
-{
-    int seqLength = output->GetDim(-1);
-
-    for (int i = 0; i < output->unitNum; i += seqLength) {
-        for (int j = 0; j < seqLength; j++) {
-            int w = output->GetInt(i + j);
-            fprintf(file, "%d ", w);
-            if (w < 0)
-                break;
-        }
-
-        fprintf(file, "\n");
-    }
-}
-
-}
--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TUtility.cpp
+++ b/source/sample/transformer/T2TUtility.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-namespace transformer
-{
-
-FILE * tmpFILE;
-int llnum = 0;
-FILE * tf = NULL;
-
-void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            strcpy(p, argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        strcpy(p, defaultP);
-}
-
-void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            *(int*)p = atoi(argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname)){
-            *(bool*)p = true;
-            //fprintf(stderr, " %s=%s\n", name, "true");
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            *p = (float)atof(argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void ShowParams(int argc, char ** argv)
-{
-    fprintf(stderr, "args:\n");
-    for(int i = 0; i < argc; i++){
-        if(argv[i][1] == 0)
-            continue;
-        if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
-            if(i + 1 < argc && argv[i + 1][0] != '-')
-                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
-            else
-                fprintf(stderr, " %s=yes\n", argv[i]);
-        }
-    }
-    fprintf(stderr, "\n");
-}
-
-}
--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include <time.h>
-#include "Transformer.h"
-#include "T2TModel.h"
-#include "T2TUtility.h"
-#include "T2TTrainer.h"
-#include "T2TPredictor.h"
-#include "T2TTester.h"
-#include "../../tensor/XDevice.h"
-#include "../../tensor/XUtility.h"
-#include "../../tensor/XGlobal.h"
-
-namespace transformer
-{
-
-int TransformerMain(int argc, const char ** argv)
-{
-    if(argc == 0)
-        return 1;
-
-    char ** args = new char*[argc];
-    for(int i = 0; i < argc; i++){
-        args[i] = new char[strlen(argv[i]) + 1];
-        strcpy(args[i], argv[i]);
-    }
-
-    tmpFILE = fopen("tmp.txt", "wb");
-
-    ShowParams(argc, args);
-
-    bool isBeamSearch = false;
-    char * trainFN = new char[MAX_LINE_LENGTH];
-    char * modelFN = new char[MAX_LINE_LENGTH];
-    char * testFN = new char[MAX_LINE_LENGTH];
-    char * outputFN = new char[MAX_LINE_LENGTH];
-
-    LoadParamString(argc, args, "train", trainFN, "");
-    LoadParamString(argc, args, "model", modelFN, "");
-    LoadParamString(argc, args, "test", testFN, "");
-    LoadParamString(argc, args, "output", outputFN, "");
-    LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
-
-    srand((unsigned int)time(NULL));
-
-    T2TTrainer trainer;
-    trainer.Init(argc, args);
-
-    T2TModel model;
-    model.InitModel(argc, args);
-    
-    /* learn model parameters */
-    if(strcmp(trainFN, ""))
-        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
-    
-    /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
-    
-    /* load the model if neccessary */
-    if(strcmp(modelFN, ""))
-        model.Read(modelFN);
-
-    /* test the model on the new data */
-    if(strcmp(testFN, "") && strcmp(outputFN, "")){
-        /* beam search */
-        if(isBeamSearch){
-            T2TTester searcher;
-            searcher.Init(argc, args);
-            searcher.Test(testFN, outputFN, &model);
-        }
-
-        /* forced decoding */
-        else{
-            T2TTrainer tester;
-            tester.Init(argc, args);
-            tester.Test(testFN, outputFN, &model);
-        }
-    }
-
-    delete[] trainFN;
-    delete[] modelFN;
-    delete[] testFN;
-    delete[] outputFN;
-
-    for(int i = 0; i < argc; i++)
-        delete[] args[i];
-    delete[] args;
-
-    fclose(tmpFILE);
-
-    return 0;
-}
-
-}
--- a/source/sample/transformer/Utility.cpp
+++ b/source/sample/transformer/Utility.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <fstream>
+#include <sstream>
+
+#include "Utility.h"
+#include "../../tensor/XGlobal.h"
+
+using namespace nts;
+using namespace std;
+
+namespace nmt
+{
+
+/*
+load configurations from the command line (or from a configuration file)
+>> argc - number of arguments (at most MAX_PARAM_NUM)
+>> argv - the list of arguments
+If "-config <file>" is given, the options are read from that file and
+take the place of the command-line arguments for every option below.
+NOTE(review): LoadParamBool only ever switches a flag on, so the boolean
+options whose default is true cannot be turned off from here.
+*/
+Config::Config(int argc, const char** argv)
+{
+    CheckNTErrors(argc <= MAX_PARAM_NUM, "too many arguments");
+
+    /* every slot starts as NULL, so LoadFromFile and the clean-up below
+       can tell which entries own a buffer */
+    char** args = new char* [MAX_PARAM_NUM]();
+    for (int i = 0; i < argc; i++) {
+        args[i] = new char[strlen(argv[i]) + 1];
+        strcpy(args[i], argv[i]);
+    }
+
+    char configFN[1024];
+    LoadParamString(argc, args, "config", configFN, "");
+
+    /* number of valid entries in args; replaced when a file is loaded */
+    int argsNum = argc;
+
+    /* load configurations from a file */
+    if (strcmp(configFN, "") != 0)
+        argsNum = LoadFromFile(configFN, args);
+
+    ShowParams(argsNum, args);
+
+    /* options for the model */
+    LoadParamInt(argsNum, args, "nhead", &nhead, 4);
+    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
+    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
+    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
+    LoadParamInt(argsNum, args, "embsize", &embSize, 512);
+    LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
+    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
+    /* the default depends on modelSize, which is loaded just above */
+    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
+    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
+    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
+    LoadParamInt(argsNum, args, "padid", &padID, 1);
+    LoadParamInt(argsNum, args, "startid", &startID, 2);
+    LoadParamInt(argsNum, args, "endid", &endID, 2);
+    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
+    LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
+
+    // TODO: refactor the parameters type to support weight sharing during training
+    LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
+    LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
+    LoadParamString(argsNum, args, "model", modelFN, "");
+    LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
+    LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
+
+    /* options for training */
+    LoadParamString(argsNum, args, "train", trainFN, "");
+    LoadParamString(argsNum, args, "valid", validFN, "");
+    LoadParamInt(argsNum, args, "dev", &devID, 0);
+    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
+    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
+    /* we train if and only if a training file is given */
+    isTraining = (strcmp(trainFN, "") != 0);
+    LoadParamBool(argsNum, args, "mt", &isMT, true);
+    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
+    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
+    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
+
+    /* these must use argsNum as well, otherwise a configuration file
+       would be ignored for the options below */
+    LoadParamFloat(argsNum, args, "lrate", &lrate, 0.0015F);
+    LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0);
+    LoadParamInt(argsNum, args, "nepoch", &nepoch, 50);
+    LoadParamInt(argsNum, args, "maxcheckpoint", &maxCheckpoint, 10);
+    LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
+    LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 8000);
+    LoadParamBool(argsNum, args, "adam", &useAdam, true);
+    LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
+    LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
+    LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
+    LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
+    LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1F);
+    LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
+    LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, true);
+    LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
+    LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
+
+    LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
+    LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
+    LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
+    LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
+    LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
+    /* the default depends on wBatchSize, which is loaded above */
+    LoadParamInt(argsNum, args, "bucketsize", &bucketSize, wBatchSize * 10);
+
+    /* options for translating */
+    LoadParamString(argsNum, args, "test", testFN, "");
+    LoadParamString(argsNum, args, "output", outputFN, "");
+    LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
+    LoadParamBool(argsNum, args, "fp16", &useFP16, false);
+    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6F);
+    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2F);
+
+    /* release every slot: unused ones are NULL, which delete[] accepts */
+    for (int i = 0; i < MAX_PARAM_NUM; i++)
+        delete[] args[i];
+    delete[] args;
+}
+
+/*
+load configurations from a file
+>> configFN - path to the configuration file
+>> args - the list to store the configurations; it must hold MAX_PARAM_NUM
+           entries, each being either NULL or a buffer allocated with new[]
+           (an existing buffer is released and replaced)
+<< return - the number of entries written into args
+format: one option per line, i.e., a name and a value separated by a blank
+or a tab; the leading '-' of the name may be omitted
+*/
+int Config::LoadFromFile(const char* configFN, char** args) {
+    ifstream f(configFN, ios::in);
+    CheckNTErrors(f.is_open(), "unable to open the config file");
+
+    int argsNum = 0;
+
+    /* parse arguments */
+    string key, value;
+    while (f >> key >> value) {
+        CheckNTErrors(argsNum + 2 <= MAX_PARAM_NUM, "too many options in the config file");
+
+        /* the LoadParam* functions look for "-name", so the dash goes in front */
+        if (key[0] != '-')
+            key.insert(0, 1, '-');
+
+        /* give each entry a buffer of exactly the size it needs */
+        delete[] args[argsNum];
+        args[argsNum] = new char[key.length() + 1];
+        strcpy(args[argsNum++], key.c_str());
+
+        delete[] args[argsNum];
+        args[argsNum] = new char[value.length() + 1];
+        strcpy(args[argsNum++], value.c_str());
+    }
+
+    /* record the number of arguments */
+    return argsNum;
+}
+
+/*
+load a string option from the argument list
+>> argc - number of arguments
+>> argv - the list of arguments
+>> name - name of the option (without the leading '-')
+>> p - buffer that receives the value; it must be large enough for the
+        value, no bound is checked here
+>> defaultP - the value used when the option is absent
+*/
+void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
+{
+    /* "-name" is built in a string, so a long name cannot overflow a fixed buffer */
+    const std::string vname = std::string("-") + name;
+
+    /* the first occurrence that is followed by a value wins */
+    for (int i = 0; i + 1 < argc; i++) {
+        if (vname == argv[i]) {
+            strcpy(p, argv[i + 1]);
+            return;
+        }
+    }
+    strcpy(p, defaultP);
+}
+
+/*
+load an integer option from the argument list
+>> argc - number of arguments
+>> argv - the list of arguments
+>> name - name of the option (without the leading '-')
+>> p - where we keep the value (converted with atoi)
+>> defaultP - the value used when the option is absent
+*/
+void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
+{
+    /* "-name" is built in a string, so a long name cannot overflow a fixed buffer */
+    const std::string vname = std::string("-") + name;
+
+    /* the first occurrence that is followed by a value wins */
+    for (int i = 0; i + 1 < argc; i++) {
+        if (vname == argv[i]) {
+            *p = atoi(argv[i + 1]);
+            return;
+        }
+    }
+    *p = defaultP;
+}
+
+/*
+load a boolean option from the argument list
+>> argc - number of arguments
+>> argv - the list of arguments
+>> name - name of the option (without the leading '-')
+>> p - where we keep the value; it is set to true if "-name" appears
+        anywhere in the list (no value is read after it)
+>> defaultP - the value used when the option is absent
+*/
+void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
+{
+    /* "-name" is built in a string, so a long name cannot overflow a fixed buffer */
+    const std::string vname = std::string("-") + name;
+
+    for (int i = 0; i < argc; i++) {
+        if (vname == argv[i]) {
+            *p = true;
+            return;
+        }
+    }
+    *p = defaultP;
+}
+
+/*
+load a float option from the argument list
+>> argc - number of arguments
+>> argv - the list of arguments
+>> name - name of the option (without the leading '-')
+>> p - where we keep the value (converted with atof)
+>> defaultP - the value used when the option is absent
+*/
+void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
+{
+    /* "-name" is built in a string, so a long name cannot overflow a fixed buffer */
+    const std::string vname = std::string("-") + name;
+
+    /* the first occurrence that is followed by a value wins */
+    for (int i = 0; i + 1 < argc; i++) {
+        if (vname == argv[i]) {
+            *p = static_cast<float>(atof(argv[i + 1]));
+            return;
+        }
+    }
+    *p = defaultP;
+}
+
+/*
+print the options in the argument list to stderr
+>> argc - number of arguments
+>> argv - the list of arguments
+An option is shown as "-name=value" when it is followed by an argument
+that does not start with '-', and as "-name=yes" otherwise.
+*/
+void ShowParams(int argc, char** argv)
+{
+    fprintf(stderr, "args:\n");
+    for (int i = 0; i < argc; i++) {
+        const char* arg = argv[i];
+
+        /* skip empty and one-character arguments; arg[0] is tested first so
+           that we never read past the terminator of an empty string */
+        if (arg[0] == 0 || arg[1] == 0)
+            continue;
+
+        /* an option starts with '-' and the next character is not a digit in 1..9 */
+        const bool isOption = (arg[0] == '-') && (arg[1] < '1' || arg[1] > '9');
+        if (!isOption)
+            continue;
+
+        if (i + 1 < argc && argv[i + 1][0] != '-')
+            fprintf(stderr, " %s=%s\n", arg, argv[i + 1]);
+        else
+            fprintf(stderr, " %s=yes\n", arg);
+    }
+    fprintf(stderr, "\n");
+}
+
+#define MAX_WORD_NUM 120
+
+/*
+split string by delimiter, this will return the start offsets of all
+non-empty sub-strings
+>> s - the original string
+>> delimiter - the separator between sub-strings; if it is empty, the whole
+               string is treated as one sub-string that starts at 0
+<< indices - start offsets (in s) of all sub-strings
+*/
+UInt64List SplitToPos(const string& s, const string& delimiter)
+{
+    UInt64List indices;
+
+    /* an empty delimiter is found at every position without advancing the
+       search, so we must return here instead of entering the loop below */
+    if (delimiter.length() == 0) {
+        indices.Add(0);
+        return indices;
+    }
+
+    size_t pos = 0;
+    uint64_t start = 0;
+    while ((pos = s.find(delimiter, start)) != string::npos) {
+        /* pos == start means an empty field, e.g., two delimiters in a row */
+        if (pos != start) {
+            indices.Add(start);
+        }
+        start = pos + delimiter.length();
+    }
+
+    /* the last field, unless the string ends with a delimiter */
+    if (start != s.length()) {
+        indices.Add(start);
+    }
+    return indices;
+}
+
+/*
+split a string to an int list
+>> s - the original string
+>> delimiter - the separator between numbers
+<< return - the numbers, each parsed in base 10 from the start of a field
+*/
+IntList SplitInt(const string& s, const string& delimiter)
+{
+    const char* text = s.data();
+    auto offsets = SplitToPos(s, delimiter);
+
+    IntList numbers;
+    for (int k = 0; k < offsets.Size(); k++) {
+        /* strtol stops at the first character that is not part of the number */
+        const char* field = text + offsets[k];
+        numbers.Add(strtol(field, nullptr, 10));
+    }
+    return numbers;
+}
+
+/*
+split a string to a float list
+>> s - the original string
+>> delimiter - the separator between numbers
+<< return - the numbers, each parsed from the start of a field
+*/
+FloatList SplitFloat(const string& s, const string& delimiter)
+{
+    const char* text = s.data();
+    auto offsets = SplitToPos(s, delimiter);
+
+    FloatList numbers;
+    for (int k = 0; k < offsets.Size(); k++) {
+        /* strtof stops at the first character that is not part of the number */
+        const char* field = text + offsets[k];
+        numbers.Add(strtof(field, nullptr));
+    }
+    return numbers;
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/Utility.h
+++ b/source/sample/transformer/Utility.h
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
+ */
+
+#ifndef __UTILITY_H__
+#define __UTILITY_H__
+
+#include <string>
+#include <cstdio>
+
+#include "../../tensor/XList.h"
+
+using namespace std;
+using namespace nts;
+
+namespace nmt
+{
+
+#define MAX_PARAM_NUM 100
+
+/* load arguments */
+void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
+void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
+void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
+void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
+
+/* show arguments */
+void ShowParams(int argc, char** argv);
+
+/* split string */
+IntList SplitInt(const string& s, const string& delimiter);
+FloatList SplitFloat(const string& s, const string& delimiter);
+UInt64List SplitToPos(const string& s, const string& delimiter);
+
+/* configurations for the model, training and translation */
+class Config {
+public:
+    /* path to the model */
+    char modelFN[1024];
+
+    /* path to the source vocab */
+    char srcVocabFN[1024];
+
+    /* path to the target vocab */
+    char tgtVocabFN[1024];
+
+    /* path to the input file (for inference) */
+    char testFN[1024];
+
+    /* path to the output file (for inference) */
+    char outputFN[1024];
+
+    /* path to the training file */
+    char trainFN[1024];
+
+    /* path to the validation file */
+    char validFN[1024];
+
+    /* device id */
+    int devID;
+
+    /* beam size */
+    int beamSize;
+
+    /* word batch size */
+    int wBatchSize;
+
+    /* sentence batch size */
+    int sBatchSize;
+
+    /* number of heads in attention */
+    int nhead;
+
+    /* number of encoder layers */
+    int nEncLayer;
+
+    /* number of decoder layers */
+    int nDecLayer;
+
+    /* the maximum relative position in RPR attentions */
+    int maxRP;
+
+    /* the dimension of embeddings */
+    int embSize;
+
+    /* the dimension of hidden layer */
+    int modelSize;
+
+    /* the maximum length in positional embedding */
+    int maxPosLen;
+
+    /* the dimension of fnn hidden layer */
+    int fnnHiddenSize;
+
+    /* the vocab size of source sequence */
+    int srcVocabSize;
+
+    /* the vocab size of target sequence */
+    int tgtVocabSize;
+
+    /* the padding id */
+    int padID;
+
+    /* start symbol */
+    int startID;
+
+    /* end symbol */
+    int endID;
+
+    /* indicates whether the model uses pre-norm */
+    bool preNorm;
+
+    /* indicates whether the model is running for machine translation */
+    bool isMT;
+
+    /* indicates whether share encoder decoder embeddings */
+    int shareAllEmbeddings;
+
+    /* indicates whether share decoder embeddings and output weights */
+    int shareDecInputOutputWeight;
+
+    /* indicates whether the model is running with FP16 data type */
+    bool useFP16;
+
+    /* indicates whether we use the RPR attention */
+    bool useRPR;
+
+    /* indicates whether we train the model */
+    bool isTraining;
+
+    /* dropout rate for the model */
+    float dropout;
+
+    /* dropout rate for fnn layers */
+    float fnnDropout;
+
+    /* dropout rate for attention layers */
+    float attDropout;
+
+    /* the alpha parameter controls the length preference */
+    float lenAlpha;
+
+    /* scalar of the input sequence (for max number of search steps) */
+    float maxLenAlpha;
+
+    /* learning rate */
+    float lrate;
+
+    /* the parameter that controls the maximum learning rate in training */
+    float lrbias;
+
+    /* training epoch number */
+    int nepoch;
+
+    /* training step number */
+    int nstep;
+
+    /* the maximum number of saved checkpoints */
+    int maxCheckpoint;
+
+    /* indicates whether we use Adam */
+    bool useAdam;
+
+    /* hyper parameters of Adam */
+    float adamBeta1;
+    float adamBeta2;
+    float adamDelta;
+
+    /* step number of warm-up for training */
+    int nwarmup;
+
+    /* indicates whether the data file is shuffled for training */
+    bool isShuffled;
+
+    /* the factor of label smoothing */
+    float labelSmoothingP;
+
+    /* number of steps after which we make a checkpoint */
+    int nStepCheckpoint;
+
+    /* indicates whether we make a checkpoint after each training epoch */
+    bool useEpochCheckpoint;
+
+    /* number of batches on which we do model update */
+    int updateStep;
+
+    /* indicates whether the sequence is sorted by length */
+    bool isLenSorted;
+
+    /* buffer size */
+    int bufSize;
+
+    /* indicates whether we double the </s> symbol for the output of LM */
+    bool isDoubledEnd;
+
+    /* indicates whether we use batchsize = max * sc
+       rather than batchsize = word-number, where max is the maximum
+       length and sc is the sentence number */
+    bool isSmallBatch;
+
+    /* counterpart of "isSmallBatch" */
+    bool isBigBatch;
+
+    /* randomize batches */
+    bool isRandomBatch;
+
+    /* bucket size */
+    int bucketSize;
+
+public:
+
+    /* load configurations from the command */
+    Config(int argc, const char** argv);
+
+    /* load configurations from a file */
+    int LoadFromFile(const char* configFN, char** args);
+};
+
+}
+
+#endif
--- a/source/sample/transformer/module/Attention.cpp
+++ b/source/sample/transformer/module/Attention.cpp
--- a/source/sample/transformer/module/Attention.h
+++ b/source/sample/transformer/module/Attention.h
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
+ */
+
+#ifndef __ATTENTION_H__
+#define __ATTENTION_H__
+
+#include "NNUtil.h"
+#include "../Utility.h"
+#include "../../../network/XNet.h"
+#include "../../../tensor/core/CHeader.h"
+
+using namespace nts;
+
+namespace nmt
+{
+/* attention type */
+enum { NONE, SELF_ATT, EN_DE_ATT };
+
+/* layer cache for keys and values */
+class Cache
+{
+public:
+    /* cache for keys, (B, L, H) */
+    XTensor key;
+
+    /* cache for values, (B, L, H) */
+    XTensor value;
+
+public:
+
+    /* indicates cache miss if 'true' */
+    bool miss;
+
+    /* indicates whether we use cache */
+    bool enable;
+
+    /* constructor */
+    Cache();
+
+    /* update the states cache */
+    void Update(XTensor&& k, XTensor&& v);
+
+    /* keep alive states */
+    void KeepAlive(XTensor& aliveIdx);
+
+    /* reorder alive states */
+    void Reorder(XTensor& reorder);
+};
+
+/* multi-head attention */
+class Attention
+{
+public:
+    /* device id */
+    int devID;
+
+    /* head number */
+    int nhead;
+
+    /* transformation matrix for Q */
+    XTensor weightQ;
+
+    /* bias for Q */
+    XTensor biasQ;
+
+    /* transformation matrix for K */
+    XTensor weightK;
+
+    /* bias for K */
+    XTensor biasK;
+
+    /* transformation matrix for V */
+    XTensor weightV;
+
+    /* bias for V */
+    XTensor biasV;
+
+    XTensor wBig;
+
+    XTensor bBig;
+
+    /* RPR emb */
+    XTensor RPEmbK;
+
+    /* transformation after dot-product attention */
+    XTensor weightO;
+
+    /* bias after dot-product attention */
+    XTensor biasO;
+
+    /* size of transformed Q and K */
+    int dk;
+
+    /* size of transformed V */
+    int dv;
+
+    /* size of input Q, K and V */
+    int d;
+
+    /* indicates whether we use the RPR attention */
+    bool useRPR;
+
+    /* dropout probability */
+    DTYPE dropoutP;
+
+    /* the maximum relative window size */
+    int maxRP;
+
+public:
+    /* constructor */
+    Attention();
+
+    /* de-constructor */
+    ~Attention();
+
+    /* initialize the model */
+    void InitModel(Config& config);
+
+    /* make the network */
+    XTensor Make(XTensor& k, XTensor& q, XTensor& v,
+                 XTensor* mask, bool isTraining,
+                 Cache* cache, int cacheType);
+
+    /* make the attention network given keys, queries and values (after linear transformation) */
+    XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
+                          XTensor* mask, bool isTraining);
+
+    /* make the RPR attention network given keys, queries and values (after linear transformation) */
+    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
+                             XTensor* mask, bool isTraining, bool isEnc);
+
+    /* generate relative position embeddings */
+    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
+
+    /* relative position-aware dot-product attention inner calculation */
+    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
+};
+}
+
+#endif
--- a/source/sample/transformer/module/CommonModules.cpp
+++ b/source/sample/transformer/module/CommonModules.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
+ * This file includes some common modules of the Transformer model
+ */
+
+#include "CommonModules.h"
+#include "../../../tensor/core/CHeader.h"
+#include "../../../tensor/function/FHeader.h"
+
+namespace nmt
+{
+
+/*
+layer normalization that can be placed before or after a sub-layer
+>> input - input tensor
+>> ln - the layernorm network
+>> prenorm - whether we use prenorm or not
+>> before - whether we use layernorm before attention/fnn (not read here)
+>> after - whether we use layernorm after attention/fnn
+<< return - ln.Make(input) if exactly one of "after" and "prenorm" is true,
+            and the input itself otherwise
+*/
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
+{
+    /* for two bools, "!=" gives the same answer as "^" */
+    const bool normalize = (after != prenorm);
+
+    if (!normalize)
+        return input;
+
+    return ln.Make(input);
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/T2TUtility.h
+++ b/source/sample/transformer/T2TUtility.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,32 +14,24 @@
 * limitations under the License.
 */

-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TUTILITY_H__
-#define __T2TUTILITY_H__
+ /*
+  * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
+  */

-#include <stdio.h>
+#ifndef __COMMONMODULE_H__
+#define __COMMONMODULE_H__

-namespace transformer
-{
+#include "LayerNorm.h"
+#include "CommonModules.h"

-extern FILE * tmpFILE;
+using namespace nts;

-/* load arguments */
-void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
-void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
-void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
-void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
-
-/* show arguments */
-void ShowParams(int argc, char ** argv);
+namespace nmt
+{

-extern int llnum;
-extern FILE * tf;
+/* the layer normalization module that controls whether pre-norm or post-norm is applied */
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);

 }

-#endif
+#endif
\ No newline at end of file
--- a/source/sample/transformer/module/Embedding.cpp
+++ b/source/sample/transformer/module/Embedding.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
+ */
+
+#include "Embedding.h"
+#include "../Utility.h"
+#include "../../../tensor/core/CHeader.h"
+
+namespace nmt
+{
+
+/*
+constructor
+every scalar setting is marked as "not set" (-1) until InitModel() is called.
+eSize in particular must be defined here because Make() checks "eSize > 0"
+and would otherwise read an indeterminate value on an uninitialized embedder.
+*/
+Embedder::Embedder()
+{
+    devID = -1;
+    vSize = -1;
+    eSize = -1;
+    maxLength = -1;
+    d = -1;
+    padIdx = -1;
+}
+
+/* de-constructor (nothing to release explicitly) */
+Embedder::~Embedder()
+{
+}
+
+/*
+set up the embedder from the configuration
+>> config - configurations of the model
+>> isEnc - true to use the source vocabulary size, false to use the target one
+*/
+void Embedder::InitModel(Config& config, bool isEnc)
+{
+    /* copy the settings we need */
+    devID = config.devID;
+    padIdx = config.padID;
+    d = config.modelSize;
+    eSize = config.embSize;
+
+    if (isEnc)
+        vSize = config.srcVocabSize;
+    else
+        vSize = config.tgtVocabSize;
+
+    /* the position table gets two rows more than the configured maximum */
+    maxLength = config.maxPosLen + 1 + 1;
+
+    /* word embedding matrix (vSize * eSize), filled by SetDataRandn(0, 1/sqrt(eSize)) */
+    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
+    DTYPE spread = 1.0F / (float)sqrt((float)eSize);
+    w.SetDataRandn(0, spread);
+
+    /* positional embedding matrix */
+    MakePosEmbedding(maxLength);
+}
+
+/*
+make positional embeddings (a matrix of size length * eSize)
+each row holds eSize/2 sine values followed by eSize/2 cosine values;
+the row of the padding index is set to zero
+>> length - number of positions (rows) in the table
+*/
+void Embedder::MakePosEmbedding(int length)
+{
+    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
+
+    /* value-initialized: the last column is never written by the loops below
+       when eSize is odd, so it must start from zero instead of garbage */
+    float* data = new float[posEmbeddingBase.unitNum]();
+
+    const int channelSize = eSize / 2;
+
+    /* with a single channel (eSize == 2 or 3) "channelSize - 1" is zero and the
+       original division produced NaN; i is always 0 in that case, so any
+       non-zero divisor gives the intended exp(0) = 1 */
+    const int divisor = (channelSize > 1) ? (channelSize - 1) : 1;
+
+    for (int pos = 0; pos < length; pos++) {
+        float* dp = data + pos * eSize;
+        int offset = 0;
+
+        for (int i = 0; i < channelSize; i++) {
+            dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / divisor));
+        }
+        for (int i = 0; i < channelSize; i++) {
+            dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / divisor));
+        }
+    }
+
+    /* padding zeros: only touch the row if the padding index is inside the
+       table, otherwise we would write outside the buffer */
+    if (padIdx >= 0 && padIdx < length) {
+        float* padRow = data + padIdx * eSize;
+        for (int i = 0; i < eSize; i++)
+            padRow[i] = 0.F;
+    }
+
+    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
+
+    /* keep the table in the same data type as the word embedding matrix */
+    if (w.dataType != posEmbeddingBase.dataType)
+        posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);
+
+    delete[] data;
+}
+
+/*
+make the network
+>> input - the word indices
+>> isDec - indicates whether it is decoder
+>> isTraining - indicates whether it is training
+>> nstep - the current decoding step (only used for step-wise decoding,
+           i.e., decoder + not training + at most one token per sequence)
+<< return - word & position embeddings of the input
+*/
+XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
+{
+    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
+    CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
+    CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
+
+    const int seqLen = input.GetDim(-1);
+
+    /* decoder embeddings during decoding: one position per call */
+    const bool stepwise = isDec && !isTraining && seqLen <= 1;
+
+    /* position indices are shifted by (padIdx + 1), so the largest row that is
+       gathered from posEmbeddingBase is (seqLen + padIdx) for a full sequence
+       and (nstep + padIdx + 1) for step-wise decoding. Checking the raw
+       sequence length alone lets the last index run past the table. */
+    const int lastPos = stepwise ? (nstep + padIdx + 1) : (seqLen + padIdx);
+    CheckNTErrors(lastPos < maxLength, "The sequence is too long!");
+
+    XTensor wordEmbedding, position, posEmbedding;
+
+    InitTensor1D(&position, seqLen, X_INT, devID);
+
+    if (!stepwise)
+    {
+        /* positions 0, 1, 2, ... */
+        position.Range(0, position.unitNum, 1);
+
+        /* shift them so that they start right after the padding index */
+        ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
+    }
+    else
+    {
+        position.SetDataFixed(nstep + padIdx + 1);
+    }
+
+    /* we make positional embeddings first */
+    XTensor embTMP;
+    embTMP = Gather(posEmbeddingBase, position);
+    posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
+
+    /* then we make word embeddings, scaled by sqrt(eSize) */
+    wordEmbedding = Gather(w, input);
+
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
+
+    /* we sum over the two embeddings */
+    SumMe(wordEmbedding, posEmbedding);
+    return wordEmbedding;
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,33 +16,32 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
 */

-#ifndef __T2TEMBEDDING_H__
-#define __T2TEMBEDDING_H__
+#ifndef __EMBEDDING_H__
+#define __EMBEDDING_H__

-#include "../../network/XNet.h"
+#include "../Utility.h"
+#include "../../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 #define DEFAULT_EMBEDDING_SIZE 512

-/* 
+/*
 embedding (of word at position i):
 word embedding + positional embedding
 */
-class T2TEmbedder
+class Embedder
 {
 public:
    /* device id */
    int devID;
-    
-    /* memory pool */
-    XMem * mem;
-    
+
    /* vocabulary size */
    int vSize;

@@ -53,31 +51,34 @@ public:
    /* maximum length of the sequence */
    int maxLength;

-    /* dimension size of the hidden layers in the t2t model */
+    /* dimension size of the hidden layers in the  model */
    int d;

+    /* padding index */
+    int padIdx;
+
    /* word embedding matrix */
    XTensor w;

-    /* predefined positional embeddings. It can speeds up 
+    /* predefined positional embeddings. It can speed up
       the embedding processing by re-loading. */
    XTensor posEmbeddingBase;

 public:
    /* constructor */
-    T2TEmbedder();
+    Embedder();

    /* de-constructor */
-    ~T2TEmbedder();
+    ~Embedder();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
+    void InitModel(Config& config, bool isEnc = true);

    /* make positional embeddings */
-    void MakePosEmbedding(int eSize, int d, int length);
+    void MakePosEmbedding(int length);

    /* make the network */
-    XTensor Make(XTensor &input);
+    XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
 };

 }

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,88 +16,81 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <math.h>
-#include "T2TFNN.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-#include "../../tensor/function/FHeader.h"
+#include "FNN.h"
+#include "Embedding.h"
+#include "../Utility.h"
+#include "../../../tensor/core/CHeader.h"
+#include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TFNN::T2TFNN()
+FNN::FNN()
 {
-    inSize  = -1;
+    inSize = -1;
    outSize = -1;
-    hSize   = -1;
+    hSize = -1;
 }

-/* deconstructor */
-T2TFNN::~T2TFNN()
+/* de-constructor */
+FNN::~FNN()
 {
 }

-/* 
-initialize the model 
+/*
+initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
->> myDevID - device id
->> myMem - the memory pool
+>> config - configurations of the model
 */
-void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void FNN::InitModel(Config& config)
 {
-    devID = myDevID;
-    mem = myMem;
-    
-    float minmax = 0;
+    devID = config.devID;

-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
-    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
-    LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
+    inSize = config.modelSize;
+    outSize = config.modelSize;
+    hSize = config.fnnHiddenSize;
+    dropoutP = config.fnnDropout;

-    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
-    InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
+    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
+    InitTensor1D(&b1, hSize, X_FLOAT, devID);

-    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
-    InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
+    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
+    InitTensor1D(&b2, outSize, X_FLOAT, devID);

    float scale = 1.0F;
-    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
-    float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
-    
-    w1.SetDataRand(-finfout1, finfout1);
+    _SetDataFanInOut(&w1, scale);
+    _SetDataFanInOut(&w2, scale);
+
+    w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
+    w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
+
    b1.SetZeroAll();
-    w2.SetDataRand(-finfout2, finfout2);
    b2.SetZeroAll();
 }

-/* 
-make the network 
+/*
+make the network
 y = max(0, x * w1 + b1) * w2 + b2
 >> input - the input tensor
->> return - the output tensor 
+>> return - the output tensor
 */
-XTensor T2TFNN::Make(XTensor &input, bool isTraining)
+XTensor FNN::Make(XTensor& input, bool isTraining)
 {
    XTensor t1;

    /* t1 = max(0, x * w1 + b1) */
-    //t1 = Rectify(MMul(input, w1) + b1);
    t1 = Rectify(MulAndShift(input, w1, b1));
-    
-    if(isTraining && dropoutP > 0)
+
+    if (isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);

    /* result = t1 * w2 + b2 */
-    //return MMul(t1, w2) + b2;
    return MulAndShift(t1, w2, b2);
 }

-
-}
+}
\ No newline at end of file
--- a/source/sample/transformer/T2TFNN.h
+++ b/source/sample/transformer/T2TFNN.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,28 +16,28 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TFNN_H__
-#define __T2TFNN_H__
+#ifndef __FNN_H__
+#define __FNN_H__

-#include "../../tensor/XTensor.h"
+#include "LayerNorm.h"
+#include "../Utility.h"
+#include "../../../tensor/XTensor.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
-class T2TFNN
+class FNN
 {
 public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* size of input vector */
    int inSize;

@@ -59,24 +58,23 @@ public:

    /* bias of transformation 2 */
    XTensor b2;
-    
+
    /* dropout probability */
    DTYPE dropoutP;

 public:

    /* constructor */
-    T2TFNN();
+    FNN();

-    /* deconstructor */
-    ~T2TFNN();
+    /* de-constructor */
+    ~FNN();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(Config& config);

    /* make the network */
-    XTensor Make(XTensor &input, bool isTraining);
-
+    XTensor Make(XTensor& input, bool isTraining);
 };

 }

--- a/source/sample/transformer/module/GLU.cpp
+++ b/source/sample/transformer/module/GLU.cpp
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
+ */
+
+#include "GLU.h"
+#include "Embedding.h"
+#include "../Utility.h"
+#include "../../../tensor/core/CHeader.h"
+#include "../../../tensor/function/FHeader.h"
+
+namespace nmt
+{
+
+/* constructor: all sizes start as "not set" (-1) */
+GLU::GLU()
+{
+    inSize = outSize = hSize = -1;
+}
+
+/* de-constructor (nothing to release explicitly) */
+GLU::~GLU()
+{
+}
+
+/*
+initialize the model
+allocates the two linear transformations (w1, b1) and (w2, b2) used by Make()
+>> config - configurations of the model
+*/
+void GLU::InitModel(Config& config)
+{
+    devID = config.devID;
+
+    inSize = config.modelSize;
+    outSize = config.modelSize;
+
+    /* hSize was never assigned, so the weight matrices were created with the
+       constructor's -1 as their first dimension. Make() splits the last
+       dimension of its input into two halves and multiplies each half by a
+       (hSize * outSize) matrix, hence hSize is half of the input size
+       (assuming the input of Make() has inSize features - TODO confirm).
+       A positive value set by the caller beforehand is kept. */
+    if (hSize <= 0)
+        hSize = inSize / 2;
+
+    InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
+    InitTensor1D(&b1, outSize, X_FLOAT, devID);
+
+    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
+    InitTensor1D(&b2, outSize, X_FLOAT, devID);
+
+    /* NOTE(review): the parameters are only allocated here, no values are
+       assigned in this function - confirm they are loaded or set elsewhere */
+}
+
+/*
+make the network
+y = (x1 * w1 + b1) * Sigmoid(x2 * w2 + b2)
+where x1 and x2 are the two pieces obtained by splitting the input
+along its last dimension
+>> input - the input tensor
+<< return - the gated output tensor
+*/
+XTensor GLU::Make(XTensor& input)
+{
+    /* cut the input into two pieces along the last dimension */
+    TensorList pieces;
+    Split(input, pieces, -1, 2);
+
+    /* value path: first piece through (w1, b1) */
+    XTensor value;
+    value = MulAndShift(pieces.GetItem(0), w1, b1);
+
+    /* gate path: second piece through (w2, b2) */
+    XTensor gate;
+    gate = MulAndShift(pieces.GetItem(1), w2, b2);
+
+    /* the gate goes through a sigmoid before it scales the value path */
+    return value * Sigmoid(gate);
+}
+
+}
\ No newline at end of file
--- a/source/sample/transformer/module/GLU.h
+++ b/source/sample/transformer/module/GLU.h
--- a/source/sample/transformer/module/LayerHistory.cpp
+++ b/source/sample/transformer/module/LayerHistory.cpp
--- a/source/sample/transformer/module/LayerHistory.h
+++ b/source/sample/transformer/module/LayerHistory.h
--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
--- a/source/sample/transformer/T2TLayerNormal.h
+++ b/source/sample/transformer/T2TLayerNormal.h
--- a/source/sample/transformer/module/NNUtil.cpp
+++ b/source/sample/transformer/module/NNUtil.cpp
--- a/source/tensor/core/arithmetic/SubDim.cuh
+++ b/source/tensor/core/arithmetic/SubDim.cuh
--- a/source/sample/transformer/module/Output.cpp
+++ b/source/sample/transformer/module/Output.cpp
--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
--- a/source/sample/transformer/train/TrainDataSet.cpp
+++ b/source/sample/transformer/train/TrainDataSet.cpp
--- a/source/sample/transformer/train/TrainDataSet.h
+++ b/source/sample/transformer/train/TrainDataSet.h
--- a/source/sample/transformer/train/Trainer.cpp
+++ b/source/sample/transformer/train/Trainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
--- a/source/sample/transformer/translate/DataSet.cpp
+++ b/source/sample/transformer/translate/DataSet.cpp
--- a/source/sample/transformer/translate/DataSet.h
+++ b/source/sample/transformer/translate/DataSet.h
--- a/source/sample/transformer/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/T2TLengthPenalty.cpp
--- a/source/sample/transformer/T2TLengthPenalty.h
+++ b/source/sample/transformer/T2TLengthPenalty.h
--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
--- a/source/sample/transformer/T2TPredictor.h
+++ b/source/sample/transformer/T2TPredictor.h
--- a/source/sample/transformer/translate/Search.cpp
+++ b/source/sample/transformer/translate/Search.cpp
--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
--- a/source/sample/transformer/translate/Translator.cpp
+++ b/source/sample/transformer/translate/Translator.cpp
--- a/source/sample/transformer/T2TTester.h
+++ b/source/sample/transformer/T2TTester.h
--- a/source/sample/transformer/translate/Vocab.cpp
+++ b/source/sample/transformer/translate/Vocab.cpp
--- a/source/sample/transformer/translate/Vocab.h
+++ b/source/sample/transformer/translate/Vocab.h
--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
--- a/source/tensor/XBLAS.cpp
+++ b/source/tensor/XBLAS.cpp
--- a/source/tensor/XBLAS.h
+++ b/source/tensor/XBLAS.h
--- a/source/tensor/XCall.cpp
+++ b/source/tensor/XCall.cpp
--- a/source/tensor/XCall.h
+++ b/source/tensor/XCall.h
--- a/source/tensor/XDataType.cpp
+++ b/source/tensor/XDataType.cpp
--- a/source/tensor/XDataType.h
+++ b/source/tensor/XDataType.h
--- a/source/tensor/XDevice.cpp
+++ b/source/tensor/XDevice.cpp
--- a/source/tensor/XDevice.h
+++ b/source/tensor/XDevice.h
--- a/source/tensor/XGlobal.cpp
+++ b/source/tensor/XGlobal.cpp
--- a/source/tensor/XGlobal.h
+++ b/source/tensor/XGlobal.h
--- a/source/tensor/XHeap.cpp
+++ b/source/tensor/XHeap.cpp
--- a/source/tensor/XHeap.h
+++ b/source/tensor/XHeap.h
--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
--- a/source/tensor/XLink.h
+++ b/source/tensor/XLink.h
--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
--- a/source/tensor/XMem.h
+++ b/source/tensor/XMem.h
--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
--- a/source/tensor/XPRunner.cpp
+++ b/source/tensor/XPRunner.cpp
--- a/source/tensor/XPRunner.h
+++ b/source/tensor/XPRunner.h
--- a/source/tensor/XQueue.cpp
+++ b/source/tensor/XQueue.cpp
--- a/source/tensor/XQueue.h
+++ b/source/tensor/XQueue.h
--- a/source/tensor/XStream.cpp
+++ b/source/tensor/XStream.cpp
--- a/source/tensor/XStream.h
+++ b/source/tensor/XStream.h
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
--- a/source/tensor/XThread.cpp
+++ b/source/tensor/XThread.cpp
--- a/source/tensor/XThread.h
+++ b/source/tensor/XThread.h
--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
--- a/source/tensor/core/arithmetic/Div.cuh
+++ b/source/tensor/core/arithmetic/Div.cuh
--- a/source/tensor/core/arithmetic/Div.h
+++ b/source/tensor/core/arithmetic/Div.h
--- a/source/tensor/core/arithmetic/DivDim.cpp
+++ b/source/tensor/core/arithmetic/DivDim.cpp
--- a/source/tensor/core/arithmetic/DivDim.cu
+++ b/source/tensor/core/arithmetic/DivDim.cu
--- a/source/tensor/core/arithmetic/DivDim.cuh
+++ b/source/tensor/core/arithmetic/DivDim.cuh
--- a/source/tensor/core/arithmetic/DivDim.h
+++ b/source/tensor/core/arithmetic/DivDim.h
--- a/source/tensor/core/arithmetic/Mask.cpp
+++ b/source/tensor/core/arithmetic/Mask.cpp
--- a/source/tensor/core/arithmetic/Mask.cu
+++ b/source/tensor/core/arithmetic/Mask.cu
--- a/source/tensor/core/arithmetic/Mask.cuh
+++ b/source/tensor/core/arithmetic/Mask.cuh
--- a/source/tensor/core/arithmetic/Mask.h
+++ b/source/tensor/core/arithmetic/Mask.h
--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
--- a/source/tensor/core/arithmetic/MatrixMul.h
+++ b/source/tensor/core/arithmetic/MatrixMul.h
--- a/source/tensor/core/arithmetic/MatrixMul2D.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2D.cu
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cu
--- a/source/tensor/core/arithmetic/MatrixMul2D.cuh
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cuh
--- a/source/tensor/core/arithmetic/MatrixMul2D.h
+++ b/source/tensor/core/arithmetic/MatrixMul2D.h
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.h
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.h
--- a/source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DParallel.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2DParallel.h
+++ b/source/tensor/core/arithmetic/MatrixMul2DParallel.h
--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
--- a/source/tensor/core/arithmetic/MatrixMulBatched.h
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.h
--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
--- a/source/tensor/core/arithmetic/MulAndShift.h
+++ b/source/tensor/core/arithmetic/MulAndShift.h
--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
--- a/source/tensor/core/arithmetic/Multiply.cuh
+++ b/source/tensor/core/arithmetic/Multiply.cuh
--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
--- a/source/tensor/core/arithmetic/MultiplyDim.cu
+++ b/source/tensor/core/arithmetic/MultiplyDim.cu
--- a/source/tensor/core/arithmetic/MultiplyDim.cuh
+++ b/source/tensor/core/arithmetic/MultiplyDim.cuh
--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
--- a/source/tensor/core/arithmetic/Negate.cu
+++ b/source/tensor/core/arithmetic/Negate.cu
--- a/source/tensor/core/arithmetic/Negate.cuh
+++ b/source/tensor/core/arithmetic/Negate.cuh
--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
--- a/source/tensor/core/arithmetic/Sign.cu
+++ b/source/tensor/core/arithmetic/Sign.cu
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
--- a/source/tensor/core/arithmetic/Sub.cu
+++ b/source/tensor/core/arithmetic/Sub.cu
--- a/source/tensor/core/arithmetic/Sub.cuh
+++ b/source/tensor/core/arithmetic/Sub.cuh
--- a/source/tensor/core/arithmetic/Sub.h
+++ b/source/tensor/core/arithmetic/Sub.h
--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
--- a/source/tensor/core/arithmetic/SubDim.h
+++ b/source/tensor/core/arithmetic/SubDim.h
--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
--- a/source/tensor/core/arithmetic/Sum.cu
+++ b/source/tensor/core/arithmetic/Sum.cu
--- a/source/tensor/core/arithmetic/Sum.cuh
+++ b/source/tensor/core/arithmetic/Sum.cuh
--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
--- a/source/tensor/core/arithmetic/SumByColumnTV.cpp
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cpp
--- a/source/tensor/core/arithmetic/SumByColumnTV.cu
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cu
--- a/source/tensor/core/arithmetic/SumByColumnTV.cuh
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cuh
--- a/source/tensor/core/arithmetic/SumByColumnVT.cpp
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cpp
--- a/source/tensor/core/arithmetic/SumByColumnVT.cu
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cu
--- a/source/tensor/core/arithmetic/SumByColumnVT.cuh
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cuh
--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
--- a/source/tensor/core/arithmetic/SumDim.cuh
+++ b/source/tensor/core/arithmetic/SumDim.cuh
--- a/source/tensor/core/arithmetic/SumDim.h
+++ b/source/tensor/core/arithmetic/SumDim.h
--- a/source/tensor/core/arithmetic/XTensorBLAS.cpp
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cpp
--- a/source/tensor/core/arithmetic/XTensorBLAS.cu
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cu
--- a/source/tensor/core/arithmetic/XTensorBLAS.h
+++ b/source/tensor/core/arithmetic/XTensorBLAS.h
--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
--- a/source/tensor/core/getandset/ConvertDataType.cu
+++ b/source/tensor/core/getandset/ConvertDataType.cu
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
--- a/source/tensor/core/getandset/ConvertDataType.h
+++ b/source/tensor/core/getandset/ConvertDataType.h
--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
--- a/source/tensor/core/getandset/OnehotAndIndex.cuh
+++ b/source/tensor/core/getandset/OnehotAndIndex.cuh
--- a/source/tensor/core/getandset/OnehotAndIndex.h
+++ b/source/tensor/core/getandset/OnehotAndIndex.h
--- a/source/tensor/core/getandset/Select.cpp
+++ b/source/tensor/core/getandset/Select.cpp
--- a/source/tensor/core/getandset/Select.cuh
+++ b/source/tensor/core/getandset/Select.cuh
--- a/source/tensor/core/getandset/Select.h
+++ b/source/tensor/core/getandset/Select.h
--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
--- a/source/tensor/core/math/Binary.cu
+++ b/source/tensor/core/math/Binary.cu
--- a/source/tensor/core/math/Binary.cuh
+++ b/source/tensor/core/math/Binary.cuh
--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
--- a/source/tensor/core/math/Clip.cpp
+++ b/source/tensor/core/math/Clip.cpp
--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
--- a/source/tensor/core/math/Clip.cuh
+++ b/source/tensor/core/math/Clip.cuh
--- a/source/tensor/core/math/Clip.h
+++ b/source/tensor/core/math/Clip.h
--- a/source/tensor/core/math/Compare.cpp
+++ b/source/tensor/core/math/Compare.cpp
--- a/source/tensor/core/math/Compare.cu
+++ b/source/tensor/core/math/Compare.cu
--- a/source/tensor/core/math/Compare.cuh
+++ b/source/tensor/core/math/Compare.cuh
--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
--- a/source/tensor/core/math/Normalize.cpp
+++ b/source/tensor/core/math/Normalize.cpp
--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
--- a/source/tensor/core/math/Normalize.cuh
+++ b/source/tensor/core/math/Normalize.cuh
--- a/source/tensor/core/math/Normalize.h
+++ b/source/tensor/core/math/Normalize.h
--- a/source/tensor/core/math/Power.cpp
+++ b/source/tensor/core/math/Power.cpp
--- a/source/tensor/core/math/Power.cu
+++ b/source/tensor/core/math/Power.cu
--- a/source/tensor/core/math/Power.cuh
+++ b/source/tensor/core/math/Power.cuh
--- a/source/tensor/core/math/Power.h
+++ b/source/tensor/core/math/Power.h
--- a/source/tensor/core/math/ScaleAndShift.cpp
+++ b/source/tensor/core/math/ScaleAndShift.cpp
--- a/source/tensor/core/math/ScaleAndShift.cu
+++ b/source/tensor/core/math/ScaleAndShift.cu
--- a/source/tensor/core/math/ScaleAndShift.cuh
+++ b/source/tensor/core/math/ScaleAndShift.cuh
--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
--- a/source/tensor/core/movement/CopyBlocks.cpp
+++ b/source/tensor/core/movement/CopyBlocks.cpp
--- a/source/tensor/core/movement/CopyBlocks.h
+++ b/source/tensor/core/movement/CopyBlocks.h
--- a/source/tensor/core/movement/CopyBlocksInGrid.cpp
+++ b/source/tensor/core/movement/CopyBlocksInGrid.cpp
--- a/source/tensor/core/movement/CopyBlocksInGrid.cu
+++ b/source/tensor/core/movement/CopyBlocksInGrid.cu
--- a/source/tensor/core/movement/CopyBlocksInGrid.cuh
+++ b/source/tensor/core/movement/CopyBlocksInGrid.cuh
--- a/source/tensor/core/movement/CopyBlocksInGrid.h
+++ b/source/tensor/core/movement/CopyBlocksInGrid.h
--- a/source/tensor/core/movement/CopyBlocksOnSite.cpp
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cpp
--- a/source/tensor/core/movement/CopyBlocksOnSite.cu
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cu
--- a/source/tensor/core/movement/CopyBlocksOnSite.cuh
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cuh
--- a/source/tensor/core/movement/CopyBlocksOnSite.h
+++ b/source/tensor/core/movement/CopyBlocksOnSite.h
--- a/source/tensor/core/movement/CopyBlocksSelected.cu
+++ b/source/tensor/core/movement/CopyBlocksSelected.cu
--- a/source/tensor/core/movement/CopyBlocksSelected.cuh
+++ b/source/tensor/core/movement/CopyBlocksSelected.cuh
--- a/source/tensor/core/movement/CopyData2D.cpp
+++ b/source/tensor/core/movement/CopyData2D.cpp
--- a/source/tensor/core/movement/CopyData2D.h
+++ b/source/tensor/core/movement/CopyData2D.h
--- a/source/tensor/core/movement/CopyInGrid.cpp
+++ b/source/tensor/core/movement/CopyInGrid.cpp
--- a/source/tensor/core/movement/CopyInGrid.h
+++ b/source/tensor/core/movement/CopyInGrid.h
--- a/source/tensor/core/movement/CopyIndexed.cpp
+++ b/source/tensor/core/movement/CopyIndexed.cpp
--- a/source/tensor/core/movement/CopyIndexed.cu
+++ b/source/tensor/core/movement/CopyIndexed.cu
--- a/source/tensor/core/movement/CopyIndexed.cuh
+++ b/source/tensor/core/movement/CopyIndexed.cuh
--- a/source/tensor/core/movement/CopyIndexed.h
+++ b/source/tensor/core/movement/CopyIndexed.h
--- a/source/tensor/core/movement/CopyValues.cpp
+++ b/source/tensor/core/movement/CopyValues.cpp
--- a/source/tensor/core/movement/CopyValues.cu
+++ b/source/tensor/core/movement/CopyValues.cu
--- a/source/tensor/core/movement/CopyValues.cuh
+++ b/source/tensor/core/movement/CopyValues.cuh
--- a/source/tensor/core/movement/CopyValues.h
+++ b/source/tensor/core/movement/CopyValues.h
--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
--- a/source/tensor/core/movement/Gather.cuh
+++ b/source/tensor/core/movement/Gather.cuh
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
--- a/source/tensor/core/movement/Spread.cpp
+++ b/source/tensor/core/movement/Spread.cpp
--- a/source/tensor/core/movement/Spread.cu
+++ b/source/tensor/core/movement/Spread.cu
--- a/source/tensor/core/movement/Spread.cuh
+++ b/source/tensor/core/movement/Spread.cuh
--- a/source/tensor/core/movement/Spread.h
+++ b/source/tensor/core/movement/Spread.h
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
--- a/source/tensor/core/reduce/ReduceMax.cuh
+++ b/source/tensor/core/reduce/ReduceMax.cuh
--- a/source/tensor/core/reduce/ReduceMax.h
+++ b/source/tensor/core/reduce/ReduceMax.h
--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
--- a/source/tensor/core/reduce/ReduceMean.h
+++ b/source/tensor/core/reduce/ReduceMean.h
--- a/source/tensor/core/reduce/ReduceStandardVariance.h
+++ b/source/tensor/core/reduce/ReduceStandardVariance.h
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
--- a/source/tensor/core/reduce/ReduceSum.cuh
+++ b/source/tensor/core/reduce/ReduceSum.cuh
--- a/source/tensor/core/reduce/ReduceSum.h
+++ b/source/tensor/core/reduce/ReduceSum.h
--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
--- a/source/tensor/core/reduce/ReduceSumAll.h
+++ b/source/tensor/core/reduce/ReduceSumAll.h
--- a/source/tensor/core/reduce/ReduceSumSquared.cpp
+++ b/source/tensor/core/reduce/ReduceSumSquared.cpp
--- a/source/tensor/core/reduce/ReduceSumSquared.h
+++ b/source/tensor/core/reduce/ReduceSumSquared.h
--- a/source/tensor/core/reduce/ReduceVariance.cpp
+++ b/source/tensor/core/reduce/ReduceVariance.cpp
--- a/source/tensor/core/reduce/ReduceVariance.h
+++ b/source/tensor/core/reduce/ReduceVariance.h
--- a/source/tensor/core/reduce/VectorBuffer.cpp
+++ b/source/tensor/core/reduce/VectorBuffer.cpp
--- a/source/tensor/core/reduce/VectorBuffer.h
+++ b/source/tensor/core/reduce/VectorBuffer.h
--- a/source/tensor/core/shape/Concatenate.cpp
+++ b/source/tensor/core/shape/Concatenate.cpp
--- a/source/tensor/core/shape/Concatenate.h
+++ b/source/tensor/core/shape/Concatenate.h
--- a/source/tensor/core/shape/ConcatenateSolely.cpp
+++ b/source/tensor/core/shape/ConcatenateSolely.cpp
--- a/source/tensor/core/shape/ConcatenateSolely.h
+++ b/source/tensor/core/shape/ConcatenateSolely.h
--- a/source/tensor/core/shape/IsSameShaped.cpp
+++ b/source/tensor/core/shape/IsSameShaped.cpp
--- a/source/tensor/test/TSubDim.h
+++ b/source/tensor/test/TSubDim.h
--- a/source/tensor/core/shape/MakeMergeBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.cpp
--- a/source/tensor/core/shape/MakeMergeBlockIndex.cu
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.cu
--- a/source/tensor/core/shape/MakeMergeBlockIndex.cuh
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.cuh
--- a/source/tensor/core/shape/MakeMergeBlockIndex.h
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.h
--- a/source/tensor/core/shape/MakeSplitBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.cpp
--- a/source/tensor/core/shape/MakeSplitBlockIndex.cu
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.cu
--- a/source/tensor/core/shape/MakeSplitBlockIndex.cuh
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.cuh
--- a/source/tensor/core/shape/MakeSplitBlockIndex.h
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.h
--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
--- a/source/tensor/core/shape/Merge.h
+++ b/source/tensor/core/shape/Merge.h
--- a/source/tensor/core/shape/MergeBlockLists.cpp
+++ b/source/tensor/core/shape/MergeBlockLists.cpp
--- a/source/tensor/core/shape/MergeBlockLists.cu
+++ b/source/tensor/core/shape/MergeBlockLists.cu
--- a/source/tensor/core/shape/MergeBlockLists.cuh
+++ b/source/tensor/core/shape/MergeBlockLists.cuh
--- a/source/tensor/core/shape/MergeBlockLists.h
+++ b/source/tensor/core/shape/MergeBlockLists.h
--- a/source/tensor/core/shape/Permute.h
+++ b/source/tensor/core/shape/Permute.h
--- a/source/tensor/core/shape/Reshape.cpp
+++ b/source/tensor/core/shape/Reshape.cpp
--- a/source/tensor/core/shape/Reshape.h
+++ b/source/tensor/core/shape/Reshape.h
--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
--- a/source/tensor/core/shape/Squeeze.cpp
+++ b/source/tensor/core/shape/Squeeze.cpp
--- a/source/tensor/core/shape/Squeeze.h
+++ b/source/tensor/core/shape/Squeeze.h
--- a/source/tensor/core/shape/Stack.cpp
+++ b/source/tensor/core/shape/Stack.cpp
--- a/source/tensor/test/TSumByColumnTV.h
+++ b/source/tensor/test/TSumByColumnTV.h
--- a/source/tensor/core/shape/Transpose.cpp
+++ b/source/tensor/core/shape/Transpose.cpp
--- a/source/tensor/core/shape/Transpose.h
+++ b/source/tensor/core/shape/Transpose.h
--- a/source/tensor/core/shape/Unsqueeze.cpp
+++ b/source/tensor/core/shape/Unsqueeze.cpp
--- a/source/tensor/core/shape/Unsqueeze.cu
+++ b/source/tensor/core/shape/Unsqueeze.cu
--- a/source/tensor/core/shape/Unsqueeze.cuh
+++ b/source/tensor/core/shape/Unsqueeze.cuh
--- a/source/tensor/core/shape/Unsqueeze.h
+++ b/source/tensor/core/shape/Unsqueeze.h
--- a/source/tensor/core/sort/Sort.cpp
+++ b/source/tensor/core/sort/Sort.cpp
--- a/source/tensor/core/sort/Sort.cu
+++ b/source/tensor/core/sort/Sort.cu
--- a/source/tensor/core/sort/Sort.cuh
+++ b/source/tensor/core/sort/Sort.cuh
--- a/source/tensor/core/sort/Sort.h
+++ b/source/tensor/core/sort/Sort.h
--- a/source/tensor/core/sort/TopK.cpp
+++ b/source/tensor/core/sort/TopK.cpp
--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
--- a/source/tensor/core/sort/TopK.cuh
+++ b/source/tensor/core/sort/TopK.cuh
--- a/source/tensor/core/sort/TopK.h
+++ b/source/tensor/core/sort/TopK.h
--- a/source/tensor/core/utilities/CheckData.cpp
+++ b/source/tensor/core/utilities/CheckData.cpp
--- a/source/tensor/core/arithmetic/SumByColumnVT.h
+++ b/source/tensor/core/arithmetic/SumByColumnVT.h
--- a/source/tensor/core/utilities/FlushToMem.cpp
+++ b/source/tensor/core/utilities/FlushToMem.cpp
--- a/source/tensor/core/utilities/FlushToMem.cu
+++ b/source/tensor/core/utilities/FlushToMem.cu
--- a/source/tensor/core/utilities/FlushToMem.cuh
+++ b/source/tensor/core/utilities/FlushToMem.cuh
--- a/source/tensor/core/utilities/FlushToMem.h
+++ b/source/tensor/core/utilities/FlushToMem.h
--- a/source/tensor/core/utilities/SetAscendingOrder.cpp
+++ b/source/tensor/core/utilities/SetAscendingOrder.cpp
--- a/source/tensor/core/utilities/SetAscendingOrder.cu
+++ b/source/tensor/core/utilities/SetAscendingOrder.cu
--- a/source/tensor/core/utilities/SetAscendingOrder.cuh
+++ b/source/tensor/core/utilities/SetAscendingOrder.cuh
--- a/source/tensor/core/arithmetic/SumByColumnTV.h
+++ b/source/tensor/core/arithmetic/SumByColumnTV.h
--- a/source/tensor/core/utilities/XMatrixSegment.cpp
+++ b/source/tensor/core/utilities/XMatrixSegment.cpp
--- a/source/tensor/core/utilities/XMatrixSegment.h
+++ b/source/tensor/core/utilities/XMatrixSegment.h
--- a/source/tensor/function/Dropout.cpp
+++ b/source/tensor/function/Dropout.cpp
--- a/source/tensor/function/Dropout.cu
+++ b/source/tensor/function/Dropout.cu
--- a/source/tensor/function/Dropout.cuh
+++ b/source/tensor/function/Dropout.cuh
--- a/source/tensor/function/Dropout.h
+++ b/source/tensor/function/Dropout.h
--- a/source/tensor/function/DropoutWithIndex.cpp
+++ b/source/tensor/function/DropoutWithIndex.cpp
--- a/source/tensor/function/DropoutWithIndex.cu
+++ b/source/tensor/function/DropoutWithIndex.cu
--- a/source/tensor/function/DropoutWithIndex.cuh
+++ b/source/tensor/function/DropoutWithIndex.cuh
--- a/source/tensor/function/DropoutWithIndex.h
+++ b/source/tensor/function/DropoutWithIndex.h
--- a/source/tensor/function/FHeader.h
+++ b/source/tensor/function/FHeader.h
--- a/source/tensor/function/HardTanH.cpp
+++ b/source/tensor/function/HardTanH.cpp
--- a/source/tensor/function/HardTanH.cu
+++ b/source/tensor/function/HardTanH.cu
--- a/source/tensor/function/HardTanH.cuh
+++ b/source/tensor/function/HardTanH.cuh
--- a/source/tensor/function/HardTanH.h
+++ b/source/tensor/function/HardTanH.h
--- a/source/tensor/function/Identity.cpp
+++ b/source/tensor/function/Identity.cpp
--- a/source/tensor/function/Identity.h
+++ b/source/tensor/function/Identity.h
--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
--- a/source/tensor/function/Loss.cpp
+++ b/source/tensor/function/Loss.cpp
--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
--- a/source/tensor/function/Loss.cuh
+++ b/source/tensor/function/Loss.cuh
--- a/source/tensor/function/Loss.h
+++ b/source/tensor/function/Loss.h
--- a/source/tensor/function/Rectify.cpp
+++ b/source/tensor/function/Rectify.cpp
--- a/source/tensor/function/Rectify.cu
+++ b/source/tensor/function/Rectify.cu
--- a/source/tensor/function/Rectify.cuh
+++ b/source/tensor/function/Rectify.cuh
--- a/source/tensor/function/Rectify.h
+++ b/source/tensor/function/Rectify.h
--- a/source/tensor/function/Sigmoid.cpp
+++ b/source/tensor/function/Sigmoid.cpp
--- a/source/tensor/function/Sigmoid.cu
+++ b/source/tensor/function/Sigmoid.cu
--- a/source/tensor/function/Sigmoid.cuh
+++ b/source/tensor/function/Sigmoid.cuh
--- a/source/tensor/function/Sigmoid.h
+++ b/source/tensor/function/Sigmoid.h
--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
--- a/source/tensor/function/Softmax.cuh
+++ b/source/tensor/function/Softmax.cuh
--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
--- a/source/tensor/loss/CrossEntropy.cuh
+++ b/source/tensor/loss/CrossEntropy.cuh
--- a/source/tensor/loss/CrossEntropy.h
+++ b/source/tensor/loss/CrossEntropy.h
--- a/source/tensor/loss/LHeader.h
+++ b/source/tensor/loss/LHeader.h
--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
--- a/source/tensor/test/TAbsolute.h
+++ b/source/tensor/test/TAbsolute.h
--- a/source/tensor/test/TClip.cpp
+++ b/source/tensor/test/TClip.cpp
--- a/source/tensor/test/TClip.h
+++ b/source/tensor/test/TClip.h
--- a/source/tensor/test/TCompare.cpp
+++ b/source/tensor/test/TCompare.cpp
--- a/source/tensor/test/TCompare.h
+++ b/source/tensor/test/TCompare.h
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
--- a/source/tensor/test/TConcatenate.h
+++ b/source/tensor/test/TConcatenate.h
--- a/source/tensor/test/TConcatenateSolely.cpp
+++ b/source/tensor/test/TConcatenateSolely.cpp
--- a/source/tensor/test/TConcatenateSolely.h
+++ b/source/tensor/test/TConcatenateSolely.h
--- a/source/tensor/test/TConvertDataType.cpp
+++ b/source/tensor/test/TConvertDataType.cpp
--- a/source/tensor/test/TConvertDataType.h
+++ b/source/tensor/test/TConvertDataType.h
--- a/source/tensor/test/TCopyIndexed.cpp
+++ b/source/tensor/test/TCopyIndexed.cpp
--- a/source/tensor/test/TCopyIndexed.h
+++ b/source/tensor/test/TCopyIndexed.h
--- a/source/tensor/test/TCopyValues.cpp
+++ b/source/tensor/test/TCopyValues.cpp
--- a/source/tensor/test/TCopyValues.h
+++ b/source/tensor/test/TCopyValues.h
--- a/source/tensor/test/TCos.cpp
+++ b/source/tensor/test/TCos.cpp
--- a/source/tensor/test/TCos.h
+++ b/source/tensor/test/TCos.h
--- a/source/tensor/test/TCrossEntropy.cpp
+++ b/source/tensor/test/TCrossEntropy.cpp
--- a/source/tensor/test/TCrossEntropy.h
+++ b/source/tensor/test/TCrossEntropy.h
--- a/source/tensor/test/TDiv.cpp
+++ b/source/tensor/test/TDiv.cpp
--- a/source/tensor/test/TDiv.h
+++ b/source/tensor/test/TDiv.h
--- a/source/tensor/test/TDivDim.cpp
+++ b/source/tensor/test/TDivDim.cpp
--- a/source/tensor/test/TDivDim.h
+++ b/source/tensor/test/TDivDim.h
--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
--- a/source/tensor/test/TDropout.h
+++ b/source/tensor/test/TDropout.h
--- a/source/tensor/test/TExp.cpp
+++ b/source/tensor/test/TExp.cpp
--- a/source/tensor/test/TExp.h
+++ b/source/tensor/test/TExp.h
--- a/source/tensor/test/TGather.cpp
+++ b/source/tensor/test/TGather.cpp
--- a/source/tensor/test/TGather.h
+++ b/source/tensor/test/TGather.h
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/tensor/test/THardTanH.h
+++ b/source/tensor/test/THardTanH.h
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TIdentity.h
+++ b/source/tensor/test/TIdentity.h
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
--- a/source/tensor/test/TLog.h
+++ b/source/tensor/test/TLog.h
--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TLogSoftmax.h
+++ b/source/tensor/test/TLogSoftmax.h
--- a/source/tensor/test/TLoss.cpp
+++ b/source/tensor/test/TLoss.cpp
--- a/source/tensor/test/TLoss.h
+++ b/source/tensor/test/TLoss.h
--- a/source/tensor/test/TMatrixMul.cpp
+++ b/source/tensor/test/TMatrixMul.cpp
--- a/source/tensor/test/TMatrixMul.h
+++ b/source/tensor/test/TMatrixMul.h
--- a/source/tensor/test/TMatrixMul2D.cpp
+++ b/source/tensor/test/TMatrixMul2D.cpp
--- a/source/tensor/test/TMatrixMul2D.h
+++ b/source/tensor/test/TMatrixMul2D.h
--- a/source/tensor/test/TMatrixMul2DParallel.cpp
+++ b/source/tensor/test/TMatrixMul2DParallel.cpp
--- a/source/tensor/test/TMatrixMul2DParallel.h
+++ b/source/tensor/test/TMatrixMul2DParallel.h
--- a/source/tensor/test/TMatrixMulBatched.cpp
+++ b/source/tensor/test/TMatrixMulBatched.cpp
--- a/source/tensor/test/TMatrixMulBatched.h
+++ b/source/tensor/test/TMatrixMulBatched.h
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/tensor/test/TMerge.h
+++ b/source/tensor/test/TMerge.h
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
--- a/source/tensor/test/TMultiply.h
+++ b/source/tensor/test/TMultiply.h
--- a/source/tensor/test/TMultiplyDim.cpp
+++ b/source/tensor/test/TMultiplyDim.cpp
--- a/source/tensor/test/TMultiplyDim.h
+++ b/source/tensor/test/TMultiplyDim.h
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/tensor/test/TNegate.h
+++ b/source/tensor/test/TNegate.h
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/tensor/test/TNormalize.h
+++ b/source/tensor/test/TNormalize.h
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/tensor/test/TPower.h
+++ b/source/tensor/test/TPower.h
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TRectify.h
+++ b/source/tensor/test/TRectify.h
--- a/source/tensor/test/TReduceMax.cpp
+++ b/source/tensor/test/TReduceMax.cpp
--- a/source/tensor/test/TReduceMax.h
+++ b/source/tensor/test/TReduceMax.h
--- a/source/tensor/test/TReduceMean.cpp
+++ b/source/tensor/test/TReduceMean.cpp
--- a/source/tensor/test/TReduceMean.h
+++ b/source/tensor/test/TReduceMean.h
--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/tensor/test/TReduceSum.h
+++ b/source/tensor/test/TReduceSum.h
--- a/source/tensor/test/TReduceSumAll.cpp
+++ b/source/tensor/test/TReduceSumAll.cpp
--- a/source/tensor/test/TReduceSumAll.h
+++ b/source/tensor/test/TReduceSumAll.h
--- a/source/tensor/test/TReduceSumSquared.cpp
+++ b/source/tensor/test/TReduceSumSquared.cpp
--- a/source/tensor/test/TReduceSumSquared.h
+++ b/source/tensor/test/TReduceSumSquared.h
--- a/source/tensor/test/TReduceVariance.cpp
+++ b/source/tensor/test/TReduceVariance.cpp
--- a/source/tensor/test/TReduceVariance.h
+++ b/source/tensor/test/TReduceVariance.h
--- a/source/tensor/test/TRound.cpp
+++ b/source/tensor/test/TRound.cpp
--- a/source/tensor/test/TRound.h
+++ b/source/tensor/test/TRound.h
--- a/source/tensor/test/TScaleAndShift.cpp
+++ b/source/tensor/test/TScaleAndShift.cpp
--- a/source/tensor/test/TScaleAndShift.h
+++ b/source/tensor/test/TScaleAndShift.h
--- a/source/tensor/test/TSelect.cpp
+++ b/source/tensor/test/TSelect.cpp
--- a/source/tensor/test/TSelect.h
+++ b/source/tensor/test/TSelect.h
--- a/source/tensor/test/TSetAscendingOrder.cpp
+++ b/source/tensor/test/TSetAscendingOrder.cpp
--- a/source/tensor/test/TSetAscendingOrder.h
+++ b/source/tensor/test/TSetAscendingOrder.h
--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
--- a/source/tensor/test/TSetData.h
+++ b/source/tensor/test/TSetData.h
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSigmoid.h
+++ b/source/tensor/test/TSigmoid.h
--- a/source/tensor/test/TSign.cpp
+++ b/source/tensor/test/TSign.cpp
--- a/source/tensor/test/TSign.h
+++ b/source/tensor/test/TSign.h
--- a/source/tensor/test/TSin.cpp
+++ b/source/tensor/test/TSin.cpp
--- a/source/tensor/test/TSin.h
+++ b/source/tensor/test/TSin.h
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSoftmax.h
+++ b/source/tensor/test/TSoftmax.h
--- a/source/tensor/test/TSort.cpp
+++ b/source/tensor/test/TSort.cpp
--- a/source/tensor/test/TSort.h
+++ b/source/tensor/test/TSort.h
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSplit.h
+++ b/source/tensor/test/TSplit.h
--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
--- a/source/tensor/test/TSpread.h
+++ b/source/tensor/test/TSpread.h
--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
--- a/source/tensor/test/TSub.h
+++ b/source/tensor/test/TSub.h
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/tensor/test/TSum.h
+++ b/source/tensor/test/TSum.h
--- a/source/tensor/test/TSumByColumnTV.cpp
+++ b/source/tensor/test/TSumByColumnTV.cpp
--- a/source/tensor/test/TSumByColumnVT.cpp
+++ b/source/tensor/test/TSumByColumnVT.cpp
--- a/source/tensor/test/TSumByColumnVT.h
+++ b/source/tensor/test/TSumByColumnVT.h
--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
--- a/source/tensor/test/TSumDim.h
+++ b/source/tensor/test/TSumDim.h
--- a/source/tensor/test/TTan.cpp
+++ b/source/tensor/test/TTan.cpp
--- a/source/tensor/test/TTan.h
+++ b/source/tensor/test/TTan.h
--- a/source/tensor/test/TTopK.cpp
+++ b/source/tensor/test/TTopK.cpp
--- a/source/tensor/test/TTopK.h
+++ b/source/tensor/test/TTopK.h
--- a/source/tensor/test/TTranspose.cpp
+++ b/source/tensor/test/TTranspose.cpp
--- a/source/tensor/test/TTranspose.h
+++ b/source/tensor/test/TTranspose.h
--- a/source/tensor/test/TUnsqueeze.cpp
+++ b/source/tensor/test/TUnsqueeze.cpp
--- a/source/tensor/test/TUnsqueeze.h
+++ b/source/tensor/test/TUnsqueeze.h
--- a/source/tensor/test/TXMem.cpp
+++ b/source/tensor/test/TXMem.cpp
--- a/source/tensor/test/TXMem.h
+++ b/source/tensor/test/TXMem.h
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h