no message

b9c318bd · hello · 1d17c439 · b9c318bd · b9c318bd · b9c318bd
Commit b9c318bd authored Feb 06, 2021 by hello
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ x64/
 vc140.pdb
 NiuTrans.Tensor.vcxproj.user
 NiuTrans.Tensor.aps
+*.tgz
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,35 +97,47 @@ if(USE_CUDA)
        add_definitions(-DHALF_PRECISION)
    endif()
    find_package(CUDA REQUIRED)
-    if(WIN32)
-        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
-        set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 --disable-warnings -use_fast_math -DUSE_CUDA")
-        if(USE_HALF_PRECISION)
-            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-DHALF_PRECISION")
-            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_60
-                -gencode=arch=compute_60,code=sm_60
-                -gencode=arch=compute_61,code=sm_61
-                -gencode=arch=compute_62,code=sm_62
-                -gencode=arch=compute_70,code=sm_70
-                -gencode=arch=compute_70,code=compute_70
-            )
-        else()
-            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-                -gencode=arch=compute_30,code=sm_30
-                -gencode=arch=compute_50,code=sm_50
-                -gencode=arch=compute_52,code=sm_52
+    if("${GPU_ARCH}" STREQUAL "K") # Kepler cards (CUDA 5 until CUDA 10)
+        set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
+    elseif("${GPU_ARCH}" STREQUAL "M") # Maxwell cards (CUDA 6 until CUDA 11)
+        set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
+    elseif("${GPU_ARCH}" STREQUAL "P") # Pascal (CUDA 8 and later)
+        set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
+    elseif("${GPU_ARCH}" STREQUAL "V") # Volta (CUDA 9 and later)
+        set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
+    elseif("${GPU_ARCH}" STREQUAL "T") # Turing (CUDA 10 and later)
+        set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
+    elseif("${GPU_ARCH}" STREQUAL "A") # Ampere (CUDA 11 and later)
+        set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
+    endif() # any other GPU_ARCH value leaves ARCH_FLAGS untouched
+
+    if(USE_HALF_PRECISION) # half precision: pick default targets or reject Kepler/Maxwell
+        if(NOT DEFINED GPU_ARCH) # no architecture requested: target the sm_60 .. sm_72 variants
+            set(ARCH_FLAGS -arch=sm_60
                -gencode=arch=compute_60,code=sm_60
                -gencode=arch=compute_61,code=sm_61
                -gencode=arch=compute_62,code=sm_62
                -gencode=arch=compute_70,code=sm_70
+                -gencode=arch=compute_72,code=sm_72
                -gencode=arch=compute_70,code=compute_70
            )
+        elseif("${GPU_ARCH}" STREQUAL "K" OR "${GPU_ARCH}" STREQUAL "M") # quoted so an empty GPU_ARCH cannot break the condition
+            message(FATAL_ERROR "your GPU cannot use the function half precision")
        endif()
+    endif()
+    
+    if(WIN32)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
        link_directories("${CUDA_ROOT}/lib/x64")
        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
+        if(CUDA_VERSION_MAJOR EQUAL 11)
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
+        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
@@ -133,31 +145,14 @@ if(USE_CUDA)
    else()
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
-        if(USE_HALF_PRECISION)
-            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DHALF_PRECISION")
-            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-                -gencode=arch=compute_60,code=sm_60
-                -gencode=arch=compute_61,code=sm_61
-                -gencode=arch=compute_62,code=sm_62
-                -gencode=arch=compute_70,code=sm_70
-                -gencode=arch=compute_70,code=compute_70
-            )
-        else()
-            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30 
-                -gencode=arch=compute_30,code=sm_30 
-                -gencode=arch=compute_50,code=sm_50 
-                -gencode=arch=compute_52,code=sm_52 
-                -gencode=arch=compute_60,code=sm_60 
-                -gencode=arch=compute_61,code=sm_61 
-                -gencode=arch=compute_62,code=sm_62 
-                -gencode=arch=compute_70,code=sm_70 
-                -gencode=arch=compute_70,code=compute_70 
-            )
-        endif()
-        link_directories(${CUDA_ROOT}/lib64)
-        include_directories(${CUDA_ROOT}/include)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
+        link_directories("${CUDA_ROOT}/lib64")
+        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
+        if(CUDA_VERSION_MAJOR EQUAL 11)
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
+        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")

--- a/README.md
+++ b/README.md
--- a/data/fnnlm/test/wsj.test
+++ b/data/fnnlm/test/wsj.test
--- a/data/fnnlm/train/wsj.train
+++ b/data/fnnlm/train/wsj.train
--- a/data/transformer/test/bpevocab
+++ b/data/transformer/test/bpevocab
--- a/data/transformer/test/code
+++ b/data/transformer/test/code
--- a/data/transformer/test/test.de
+++ b/data/transformer/test/test.de
--- a/data/transformer/test/test.en
+++ b/data/transformer/test/test.en
--- a/data/transformer/train/bpevocab
+++ b/data/transformer/train/bpevocab
--- a/data/transformer/train/code
+++ b/data/transformer/train/code
--- a/doc/Configuration.md
+++ b/doc/Configuration.md
-# NiuTrans.Tensor环境配置
-
-## 注意事项
-
-CUDA最新版本9.2尚且不支持VS2017最新版本，因此建议使用CUDA版本为9.0或9.1，建议使用VS版本为VS2015，或使用VS2017时安装v140工具集，解决方案平台设置为×64。
-
-## CUDA配置
-
-在已安装好VS、CUDA并配置好环境变量后，一些关键的CUDA配置选项如下所示，以下配置选项在 **项目 -> 属性** 中可以找到。
-
->$(CUDA_PATH)\include
-
-加入到 **VC++目录 -> 包含** 中。
-
->$(CUDA_PATH)\lib\Win32
-
-加入到 **VC++目录 -> 库** 中。
-
->cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
-
-加入到 **链接器->输入->附加依赖项** 中。
-
-配置完成后，右键 **工程->项目依赖性** ，选择CUDA9。
-在.cu文件上右键属性，在项类型中选择"CUDA C/C++"（最好搜索.cu文件，然后全选设置）。
-
-## 其他配置
-
-**C/C++->常规->SDL检查**，设为否。
-
-在 **C/C++->预处理器->预处理器定义** 中，添加
-
->USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS_
-CONSOLE;
-
-**链接器->系统->子系统**，设置为控制台。
-
-**常规->字符集**，使用Unicode字符集。
-
-**调试->命令参数**中设置可执行文件所需要的参数。
-
-
--- a/doc/manual.md
+++ b/doc/manual.md
@@ -39,7 +39,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装，支
 - 执行CMake命令对Visual Studio项目进行生成（如果 visual studio 版本低于 2019，则在使用下列命令的时候需额外加上`-A x64`的CMake参数），如计划生成动态链接库，则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_MKL=ON`参数，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过`-DGPU_ARCH=ARCH`指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-DUSE_HALF_PRECISION=ON`参数（需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
 - 执行成功将显示`Build files have been written to:...`。
 - 打开build目录中的NiuTensor.sln文件即可通过Visual Studio打开NiuTensor项目。
 - 打开后在解决方案管理器中选中NiuTensor，右键将其设为启动项目即可开始使用。
@@ -60,7 +60,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装，支
 - 打开CLion首选项，点击“构建，执行，部署”选项卡中的CMake，在“CMake选项”中进行设置，设置完成后CLion将自动使用CMake对项目进行构建，如计划生成动态链接库，则仅需在在“CMake选项”中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_MKL=ON`，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_OPENBLAS=ON`，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2'`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过`-DGPU_ARCH=ARCH`指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-DUSE_HALF_PRECISION=ON`参数（需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。

 ##### CMake方式（命令行）

@@ -71,7 +71,7 @@ NiuTensor工具包可以在Windows、Linux以及macOS环境下进行安装，支
 - 执行CMake命令对项目进行生成，如计划生成动态链接库，则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_MKL=ON`参数，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过`-DGPU_ARCH=ARCH`指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-DUSE_HALF_PRECISION=ON`参数（需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
 - 执行成功将显示`Build files have been written to:...`并在该目录下生成Makefile文件。
 - 执行`make -j`命令对NiuTensor项目进行编译，执行成功将显示`Built target NiuTensor`，安装完毕。


--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -26,7 +26,7 @@
 #include "./tensor/core/CHeader.h"
 #include "./tensor/test/Test.h"
 #include "./sample/fnnlm/FNNLM.h"
-#include "./sample/transformer/Transformer.h"
+#include "./sample/transformer/NMT.h"

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
@@ -34,7 +34,7 @@

 using namespace nts;
 using namespace fnnlm;
-using namespace transformer;
+using namespace nmt;

 int main( int argc, const char ** argv )
 {
@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
-        TransformerMain(argc - 1, argv + 1);
+        NMTMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TDecoder.h"
-#include "module/T2TUtility.h"
-#include "module/T2TLayerNormal.h"
-#include "module/T2TCommonModules.h"
+#include "Decoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
 #include "../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
 initialize the model
 >> config - configurations of the model
 */
-void AttDecoder::InitModel(T2TConfig& config)
+void AttDecoder::InitModel(Config& config)
 {
    devID = config.devID;
    nlayer = config.nDecLayer;
@@ -80,16 +77,17 @@ void AttDecoder::InitModel(T2TConfig& config)
    /* embedding model */
    embedder.InitModel(config, false);

-    selfAtt = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    selfAttLayerNorms = new T2TLN[nlayer];
-    enDeAtt = new T2TAttention[nlayer];
-    enDeAttLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    selfAttLayerNorms = new LN[nlayer];
+    enDeAtt = new Attention[nlayer];
+    enDeAttLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];
+
    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];
    if (preNorm)
-        decoderLayerNorm = new T2TLN;
+        decoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -99,6 +97,8 @@ void AttDecoder::InitModel(T2TConfig& config)
        fnnLayerNorms[i].InitModel(config);
        enDeAtt[i].InitModel(config);
        enDeAttLayerNorms[i].InitModel(config);
+        selfAttCache[i].enable = true;
+        enDeAttCache[i].enable = true;
    }
    if (preNorm)
        decoderLayerNorm->InitModel(config);
@@ -115,9 +115,10 @@ make the decoding network
 << return - the output tensor of the decoder
 */
 XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
-    XTensor* maskEncDec, int nstep, bool isTraining)
+                         XTensor* maskEncDec, int nstep, bool isTraining)
 {
    XTensor x;
+
    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
    }

    if (preNorm)
-        x = decoderLayerNorm->Make(x);
+        return decoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the decoding network, applying layer normalization before every
+sub-layer (self-attention, encoder-decoder attention, fnn) and once more
+on the final output
+NOTE: decoderLayerNorm is dereferenced unconditionally at the end, while
+InitModel only allocates it when preNorm is set - callers are presumably
+expected to use this function for pre-norm models only (TODO confirm)
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - mask that indicates which position is valid
+>> maskEncDec - mask for the encoder-decoder attention
+>> nstep - the current length of the decoder input
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the decoder
+*/
+XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                             XTensor* maskEncDec, int nstep, bool isTraining)
+{
+    XTensor x;
+
+    x = embedder.Make(inputDec, true, isTraining, nstep);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = selfAttLayerNorms[i].Make(x);
+
+        /******************/
+        /* self attention (the per-layer cache selfAttCache[i] is passed along) */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for encoder-decoder attention */
+        x = enDeAttLayerNorms[i].Make(x);
+
+        /* encoder-decoder attention (the per-layer cache enDeAttCache[i] is passed along) */
+        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
+                            isTraining, &enDeAttCache[i], EN_DE_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+    /* final layer normalization; note there is no preNorm check here, unlike in Make */
+    x = decoderLayerNorm->Make(x);

    return x;
 }
+
 }
\ No newline at end of file
--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TDECODER_H__
-#define __T2TDECODER_H__
+#ifndef __DECODER_H__
+#define __DECODER_H__

-#include "T2TEncoder.h"
-#include "module/T2TUtility.h"
+#include "Encoder.h"
+#include "Utility.h"

-namespace transformer
+namespace nmt
 {

 class AttDecoder
@@ -52,28 +51,28 @@ public:
    DTYPE dropoutP;

    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN* fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention* selfAtt;
+    Attention* selfAtt;

    /* layer normalization for attention */
-    T2TLN* selfAttLayerNorms;
+    LN* selfAttLayerNorms;

    /* layer normalization for fnn */
-    T2TLN* fnnLayerNorms;
+    LN* fnnLayerNorms;

    /* layer normalization for decoder */
-    T2TLN* decoderLayerNorm;
+    LN* decoderLayerNorm;

    /* encoder-decoder attention model of each layer */
-    T2TAttention* enDeAtt;
+    Attention* enDeAtt;

    /* layer normalization for encoder-decoder attention */
-    T2TLN* enDeAttLayerNorms;
+    LN* enDeAttLayerNorms;

    /* layer cache list */
    Cache* selfAttCache;
@@ -92,11 +91,15 @@ public:
    ~AttDecoder();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the decoding network */
    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                 XTensor* maskEncDec, int nstep, bool isTraining);
+
+    /* make the decoding network (pre norm) */
+    XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                     XTensor* maskEncDec, int nstep, bool isTraining);
 };

 }

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TEncoder.h"
-#include "module/T2TUtility.h"
-#include "module/T2TLayerNormal.h"
-#include "module/T2TCommonModules.h"
+#include "Encoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
 #include "../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
 initialize the model
 >> config - configurations for the model
 */
-void AttEncoder::InitModel(T2TConfig& config)
+void AttEncoder::InitModel(Config& config)
 {

    devID = config.devID;
@@ -68,18 +65,18 @@ void AttEncoder::InitModel(T2TConfig& config)
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(config);

-    selfAtt = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    attLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];

    if (preNorm)
-        encoderLayerNorm = new T2TLN;
+        encoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);

        /* self attention */
-        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
+        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
@@ -151,7 +148,63 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
    }
    if (preNorm)
-        x = encoderLayerNorm->Make(x);
+        return encoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the encoding network, applying layer normalization before every
+sub-layer (self-attention, fnn) and once more on the final output
+NOTE: encoderLayerNorm is dereferenced unconditionally at the end, while
+InitModel only allocates it when preNorm is set - callers are presumably
+expected to use this function for pre-norm models only (TODO confirm)
+>> input - the input tensor of the encoder
+>> mask - the mask that indicates whether each position is valid
+>> maskEncDec - not used by this function
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the encoder
+*/
+XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
+{
+    XTensor x;
+
+    x = embedder.Make(input, false, isTraining);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = attLayerNorms[i].Make(x);
+
+        /* self attention (no cache is passed on the encoder side) */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+    x = encoderLayerNorm->Make(x); /* final layer normalization; no preNorm check here, unlike in Make */

    return x;
 }

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,25 +19,25 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TENCODER_H__
-#define __T2TENCODER_H__
+#ifndef __ENCODER_H__
+#define __ENCODER_H__

-#include "module/T2TFNN.h"
-#include "module/T2TUtility.h"
-#include "module/T2TAttention.h"
-#include "module/T2TEmbedding.h"
-#include "module/T2TLayerNormal.h"
+#include "Utility.h"
+#include "module/FNN.h"
+#include "module/Attention.h"
+#include "module/Embedding.h"
+#include "module/LayerNorm.h"
 #include "../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /*
 base class of the encoder
 */
-class T2TEncoder
+class Encoder
 {
 public:
    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
@@ -47,7 +46,7 @@ public:
 /*
 the encoder based on self-attention
 */
-class AttEncoder : T2TEncoder
+class AttEncoder : Encoder
 {
 public:
    /* device id */
@@ -73,22 +72,22 @@ public:
    int ignored;

    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN* fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention* selfAtt;
+    Attention* selfAtt;

    /* layer normalizations for attention */
-    T2TLN* attLayerNorms;
+    LN* attLayerNorms;

    /* layer normalization for fnn */
-    T2TLN* fnnLayerNorms;
+    LN* fnnLayerNorms;

    /* layer normalization for encoder */
-    T2TLN* encoderLayerNorm;
+    LN* encoderLayerNorm;

    /* the location of layer normalization */
    bool preNorm;
@@ -101,11 +100,14 @@ public:
    ~AttEncoder();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the encoding network */
    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);

+    /* make the encoding network (pre norm) */
+    XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
+
    /* make the encoding network (wrapper) */
    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
 };

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,23 +19,22 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TMODEL_H__
-#define __T2TMODEL_H__
+#ifndef __MODEL_H__
+#define __MODEL_H__

-#include "T2TEncoder.h"
-#include "T2TDecoder.h"
-#include "module/T2TFNN.h"
-#include "module/T2TOutput.h"
-#include "module/T2TUtility.h"
-#include "module/T2TAttention.h"
+#include "Encoder.h"
+#include "Decoder.h"
+#include "module/FNN.h"
+#include "module/Output.h"
+#include "Utility.h"
+#include "module/Attention.h"

-namespace transformer
+namespace nmt
 {

-/* a transformer model that keeps parameters of the encoder,
-   the decoder and the output layer (softmax). Also, it creates
-   the network used in transformer. */
-class T2TModel
+/* an NMT model that keeps the parameters of the encoder,
+   the decoder and the output layer (softmax). */
+class Model
 {
 public:
    /* device id */
@@ -49,7 +47,7 @@ public:
    AttDecoder* decoder;

    /* output layer */
-    T2TOutput* outputLayer;
+    Output* outputLayer;

    /* indicates whether the model is running for language modeling */
    bool isLM;
@@ -71,13 +69,16 @@ public:

 public:
    /* constructor */
-    T2TModel();
+    Model();

    /* de-constructor */
-    ~T2TModel();
+    ~Model();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);
+
+    /* print model configurations */
+    void ShowModelConfig(Config& config);

    /* make the encoding network */
    XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,49 +16,47 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
 */

-#include <cmath>
 #include <ctime>

-#include "Transformer.h"
-#include "train/T2TTrainer.h"
-#include "module/T2TUtility.h"
-#include "translate/T2TTranslator.h"
-#include "../../tensor/XDevice.h"
-#include "../../tensor/XGlobal.h"
-#include "../../tensor/XUtility.h"
+#include "NMT.h"
+#include "train/Trainer.h"
+#include "translate/Translator.h"

-namespace transformer
+namespace nmt
 {

-int TransformerMain(int argc, const char** argv)
+int NMTMain(int argc, const char** argv)
 {
    if (argc == 0)
        return 1;

    /* load configurations */
-    T2TConfig config(argc, argv);
+    Config config(argc, argv);

-    srand((unsigned int)time(NULL));
+    srand(1);

-    /* train the model */
+    /* training */
    if (strcmp(config.trainFN, "") != 0) {
-        ENABLE_GRAD;
-        T2TModel model;
+        
+        Model model;
        model.InitModel(config);
-        T2TTrainer trainer;
+        Trainer trainer;
        trainer.Init(config);
        trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
    }

-    /* translate the test file */
+    /* translating */
    if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
+        
+        /* disable grad flow */
        DISABLE_GRAD;
-        T2TModel model;
+
+        Model model;
        model.InitModel(config);
-        T2TTranslator translator;
+        Translator translator;
        translator.Init(config);
        translator.Translate(config.testFN, config.srcVocabFN, 
                             config.tgtVocabFN, config.outputFN, &model);

--- a/source/sample/transformer/Transformer.h
+++ b/source/sample/transformer/Transformer.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,29 +15,17 @@
 */

 /*
- *
- * An implementation of the transformer system. See more details
- * about the Transformer model in
- * "Attention Is All You Need" by Vaswani et al.
- * https://arxiv.org/pdf/1706.03762.pdf
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- * I start writing the code related to NMT - a long time since my last coding
- * work on MT
+ * An implementation of the NMT system. 
 */

-#ifndef __TRANSFORMER_H__
-#define __TRANSFORMER_H__
-
-#include "../../tensor/XGlobal.h"
-#include "../../tensor/XTensor.h"
-#include "../../tensor/core/CHeader.h"
+#ifndef __NMT_H__
+#define __NMT_H__

-namespace transformer
+namespace nmt
 {

 /* entrance of the program */
-int TransformerMain(int argc, const char** argv);
+int NMTMain(int argc, const char** argv);

 }


--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
-  */
-
-#ifndef __T2TATTENTION_H__
-#define __T2TATTENTION_H__
-
-#include "../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-/* attention type */
-enum { NONE, SELF_ATT, EN_DE_ATT };
-
-/* layer cache for keys and values */
-class Cache
-{
-public:
-    /* cache for keys */
-    XTensor key;
-
-    /* cache for values */
-    XTensor value;
-
-public:
-
-    bool miss;
-
-    Cache() {
-        miss = true;
-    }
-
-    void Update(XTensor&& k, XTensor&& v) {
-        key = k;
-        value = v;
-        miss = false;
-    }
-};
-
-/*
-multi-head attention
-y(Q, K, V) = cat(head_1, head_2, ..., head_n)
-where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
-        attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
-        d_k = dimension size of K
-*/
-class T2TAttention
-{
-public:
-    /* device id */
-    int devID;
-
-    /* head number */
-    int nhead;
-
-    /* transformation matrix for Q */
-    XTensor wq;
-
-    /* bias for Q */
-    XTensor bq;
-
-    /* transformation matrix for K */
-    XTensor wk;
-
-    /* bias for K */
-    XTensor bk;
-
-    /* transformation matrix for V */
-    XTensor wv;
-
-    /* bias for V */
-    XTensor bv;
-
-    XTensor wBig;
-
-    XTensor bBig;
-
-    /* RPR emb */
-    XTensor rp_embedding_k;
-
-    /* transformation after dot-product attention */
-    XTensor wo;
-
-    /* bias after dot-product attention */
-    XTensor bo;
-
-    /* size of transformed Q and K */
-    int dk;
-
-    /* size of transformed V */
-    int dv;
-
-    /* size of input Q, K and V */
-    int d;
-
-    /* indicates whether the attention is masked */
-    bool isMasked;
-
-    /* some positions can be ignored in attention. this is useful in lm where the first position needs
-        special design for the attention model. */
-    int ignored;
-
-    /* indicates whether the model is used for training */
-    bool isTraining;
-
-    /* dropout probability */
-    DTYPE dropoutP;
-
-    /* max relative window size */
-    int max_relative_position;
-
-
-public:
-    /* constructor */
-    T2TAttention();
-
-    /* de-constructor */
-    ~T2TAttention();
-
-    /* initialize the model */
-    void InitModel(int argc, char** argv,
-        bool myIsMasked, int myIgnored,
-        int myDevID = -1);
-
-    /* make the network */
-    XTensor Make( XTensor& k,  XTensor& q,  XTensor& v,
-         XTensor* mask, bool isTraining, Cache* cache, int cacheType);
-
-    /* make the attention network given keys, queries and values (after linear transformation) */
-    XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,  XTensor* mask, bool isTraining, bool is_encoder);
-
-    /* make the RPR variant of the attention network given keys, queries and values (after linear transformation) */
-    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
-
-    void GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int device_id, const bool is_encoder);
-
-    void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
- */
-
-#include <math.h>
-#include "T2TEmbedding.h"
-#include "T2TUtility.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-T2TEmbedder::T2TEmbedder()
-{
-    devID = -1;
-    vSize = -1;
-    maxLength = -1;
-}
-
-/* deconstructor */
-T2TEmbedder::~T2TEmbedder()
-{
-}
-
-/* 
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
->> isEnc - if true, the vocabulary size is read from "vsize"; otherwise from "vsizetgt"
-*/
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
-{
-    devID = myDevID;
-    
-    if(isEnc){
-        LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    }
-    else{
-        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    }
-    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamInt(argc, argv, "maxlen", &maxLength, 1024);
-    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "pad", &padIdx, 1);
-
-    InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
-
-    maxLength = maxLength + 1 + 1;
-    DTYPE v = 1.0F/(float)sqrt((float)eSize);
-    w.SetDataRandn(0, v);
-
-    /* create the positional embedding matrix */
-    MakePosEmbedding(eSize, d, maxLength, padIdx);
-}
-
-/* 
-make positional embeddings (of size eSize * length)
->> eSize - embedding size
->> d - dimension size of the hidden layers
->> length - length of the sequence
-*/
-void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
-{
-    InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
-
-    float * data = new float[posEmbeddingBase.unitNum];
-
-    for(int pos = 0; pos < length; pos++){
-        float * dp = data + pos * eSize;
-        
-        int channelSize = eSize / 2;
-        int offset = 0;
-        for(int i = 0; i < channelSize; i++){
-            dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
-        }
-        for(int i = 0; i < channelSize; i++){
-            dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
-        }
-        
-    }
-    
-    /* padding zeros */
-    int padStart = padIdx * eSize;
-    for (int i = padStart; i < padStart + eSize; i++)
-        data[i] = 0.F;
-
-    posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
-
-    delete[] data;
-}
-
-/* 
-make the network 
-*/
-XTensor T2TEmbedder::Make(XTensor &input, int prevLen, int nstep, bool isDec)
-{
-    
-	/* assert padding index is 1 */
-
-	CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-	CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
-	CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
-	CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
-
-	XTensor wordEmbedding, position, posEmbedding;
-	InitTensorV2(&position, &input);
-
-	int* posData = new int[input.unitNum];
-
-	XTensor inputCPU;
-	InitTensorOnCPU(&inputCPU, &input);
-	_CopyValues(&input, &inputCPU);
-
-
-	if (!isDec)
-	{
-		for (int i = 0; i < inputCPU.GetDim(0); i++) {
-			int startNoPad = 2 + prevLen;
-			int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
-			for (int j = 0; j < inputCPU.GetDim(1); j++) {
-				if (p[j] == 1) {
-					posData[i * inputCPU.GetDim(1) + j] = 1;
-				}
-				else {
-					posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
-				}
-			}
-		}
-		position.SetData(posData, position.unitNum);
-	}
-	else
-	{
-		for (int i = 0; i < position.GetDim(0); i++) {
-			for (int j = 0; j < position.GetDim(1); j++) {
-				position.Set2DInt(nstep + 2, i, j);
-			}
-		}
-
-	}
-
-
-	delete[] posData;
-
-	/* we make positional embeddings first */
-	if (true) {
-		posEmbedding = Gather(posEmbeddingBase, position);
-	}
-    /* then we make word embeddings */
-    wordEmbedding = Gather(w, input);
-
-    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
-
-    /* we sum over the two embeddings */
-	return Sum(wordEmbedding, posEmbedding);
-}
-
-}
--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
- */
-
-#ifndef __T2TEMBEDDING_H__
-#define __T2TEMBEDDING_H__
-
-#include "../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-#define DEFAULT_EMBEDDING_SIZE 128
-
-/* 
-embedding (of word at position i):
-word embedding + positional embedding
-*/
-class T2TEmbedder
-{
-public:
-    /* device id */
-    int devID;
-    
-    /* vocabulary size */
-    int vSize;
-
-    /* embedding size */
-    int eSize;
-
-    /* maximum length of the sequence */
-    int maxLength;
-
-    /* dimension size of the hidden layers in the t2t model */
-    int d;
-
-    /* padding index */
-    int padIdx;
-
-    /* word embedding matrix */
-    XTensor w;
-
-    /* predefined positional embeddings. It can speed up 
-       the embedding processing by re-loading. */
-    XTensor posEmbeddingBase;
-
-public:
-    /* constructor */
-    T2TEmbedder();
-
-    /* de-constructor */
-    ~T2TEmbedder();
-
-    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
-
-    /* make positional embeddings */
-    void MakePosEmbedding(int eSize, int d, int length, int padIdx);
-
-    /* make the network */
-    XTensor Make(XTensor &input, int prevLen=0, int nstep = -1, bool isDec = false);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
-  */
-
-#include <math.h>
-#include "T2TFNN.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-#include "../../tensor/function/FHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-T2TFNN::T2TFNN()
-{
-    inSize = -1;
-    outSize = -1;
-    hSize = -1;
-}
-
-/* deconstructor */
-T2TFNN::~T2TFNN()
-{
-}
-
-/*
-initialize the model
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
-*/
-void T2TFNN::InitModel(int argc, char** argv, int myDevID)
-{
-    devID = myDevID;
-
-    float minmax = 0;
-
-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
-    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
-    LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
-
-    InitTensor2DV2(&w1,  inSize, hSize, X_FLOAT, devID);
-    InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
-
-    InitTensor2DV2(&w2, hSize, outSize,  X_FLOAT, devID);
-    InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
-
-    fnnLayerNorm.InitModel(argc, argv, myDevID);
-
-    //float scale = 1.0F;
-    //float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
-    //float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
-    //
-    //w1.SetDataRand(-finfout1, finfout1);
-    //b1.SetZeroAll();
-    //w2.SetDataRand(-finfout2, finfout2);
-    //b2.SetZeroAll();
-}
-
-/*
-make the network
-y = max(0, LN(x) * w1 + b1) * w2 + b2 + x, where LN is the layer normalization (fnnLayerNorm)
->> input - the input tensor
->> return - the output tensor
-*/
-XTensor T2TFNN::Make(XTensor& input, bool isTraining)
-{
-    XTensor t1;
-
-    /* t1 = max(0, x * w1 + b1) */
-    t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
-
-    if (isTraining && dropoutP > 0)
-        t1 = Dropout(t1, dropoutP);
-
-    /* result = t1 * w2 + b2 */
-    XTensor res;
-    res = MulAndShift(t1, w2, b2);
-    _SumMe(&res, &input);
-    return  res;
-}
-
-
-}
--- a/source/sample/transformer/T2TFNN.h
+++ b/source/sample/transformer/T2TFNN.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TFNN_H__
-#define __T2TFNN_H__
-
-#include "T2TLayerNormal.h"
-#include "../../tensor/XTensor.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* a fnn with layer normalization (LN) and a residual sum: y = max(0, LN(x) * w1 + b1) * w2 + b2 + x */
-class T2TFNN
-{
-public:
-    /* device id */
-    int devID;
-
-    /* size of input vector */
-    int inSize;
-
-    /* size of output vector */
-    int outSize;
-
-    /* size of hidden layers */
-    int hSize;
-
-    /* matrix of transformation 1 */
-    XTensor w1;
-
-    /* bias of transformation 1 */
-    XTensor b1;
-
-    /* matrix of transformation 2 */
-    XTensor w2;
-
-    /* bias of transformation 2 */
-    XTensor b2;
-    
-    /* layer normalization for fnn */
-    T2TLN fnnLayerNorm;
-
-    /* dropout probability */
-    DTYPE dropoutP;
-
-public:
-
-    /* constructor */
-    T2TFNN();
-
-    /* deconstructor */
-    ~T2TFNN();
-
-    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1);
-
-    /* make the network */
-    XTensor Make(XTensor &input, bool isTraining);
-
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include "T2TLayerNormal.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-
-/* constructor */
-T2TLN::T2TLN()
-{
-    devID = -1;
-    d = 0;
-}
-
-/* de-constructor */
-T2TLN::~T2TLN()
-{
-}
-
-/*
-initialize the model
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
-*/
-void T2TLN::InitModel(int argc, char ** argv, int myDevID)
-{
-    devID = myDevID;
-
-    d = 0;
-    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-
-    InitTensor1DV2(&w, d, X_FLOAT, devID);
-    InitTensor1DV2(&b, d, X_FLOAT, devID);
-}
-
-/*
-make the network
-for each layer representation x, we have
-y = (x - mean)/standard * w + b, where standard = sqrt(variance + 1e-5)
->> input - the input tensor
->> return - layer normalization output
-*/
-XTensor T2TLN::Make(XTensor &input)
-{
-    XTensor &x = input;
-    XTensor xn;
-    XTensor mean;
-    XTensor variance;
-    XTensor standard;
-    XTensor meanFilled;
-    XTensor standardFilled;
-
-    /* \mu = (sum_i x_i)/m */
-    mean = ReduceMean(x, x.order - 1);
-
-    /* \sigma^2 = (sum_i (x_i - \mu)^2)/m, plus a small constant 1e-5 */
-    variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
-
-    /* standard = sqrt(variance) */
-    standard = Power(variance, 0.5F);
-
-    /* unsqueeze mean and standard deviation to fit them into
-       the same shape of x */
-    meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
-    standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
-
-    /* x' = (x - \mu)/standard */
-    xn = (x - meanFilled) / standardFilled;
-
-    /* result = x' * w + b   */
-    return xn *  w + b;
-}
-
-}
--- a/source/sample/transformer/T2TLayerNormal.h
+++ b/source/sample/transformer/T2TLayerNormal.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TLAYERNORMAL_H__
-#define __T2TLAYERNORMAL_H__
-
-#include "../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* layer normalization: y = norm(x) * w + b 
-   where norm(x) = (x - mean)/standardDeviation */
-class T2TLN
-{
-public:
-    /* device id */
-    int devID;
-
-    /* the scaling vector w (a 1D tensor of size d) */
-    XTensor w;
-
-    /* the bias term b */
-    XTensor b;
-
-    /* dimension size of the model */
-    int d;
-    
-public:
-    /* constructor */
-    T2TLN();
-    
-    /* de-constructor */
-    ~T2TLN();
-    
-    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1);
-    
-    /* make the network */
-    XTensor Make(XTensor &input);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/T2TLengthPenalty.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../../tensor/core/CHeader.h"
-#include "T2TLengthPenalty.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* 
-GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha 
-where n = length of the sequence 
->> length - length of the sequence (for each entry)
->> alpha - the parameter controls the length preference
-<< return - length penalty of the sequence (for each entry)
-*/
-XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
-{
-    XTensor base;
-    XTensor lp;
-
-    //base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
-    base = (length + 5)/(1 + 5);
-
-    lp = Power(base, alpha);
-    
-    return lp;
-}
-
-}
--- a/source/sample/transformer/T2TLengthPenalty.h
+++ b/source/sample/transformer/T2TLengthPenalty.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
- * Start of a new week - I just finished several documents.
- * Writing document is harder than writing code :)
- */
-
-#ifndef __T2TLENGTHPENALTY_H__
-#define __T2TLENGTHPENALTY_H__
-
-#include "../../tensor/XTensor.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* We intend to penalize short sequences because they have higher score
-   in product of a sequence of probability-like terms and have more chances
-   to beat others in search. */
-class T2TLengthPenalizer
-{
-public:
-    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha 
-       where n = length of the sequence */
-    static
-    XTensor GNMT(const XTensor & length, float alpha);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <math.h>
-#include "T2TOutput.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "../../tensor/core/CHeader.h"
-
-namespace transformer
-{
-/* constructor */
-T2TOutput::T2TOutput()
-{
-    devID = -1;
-    vSize = -1;
-    inSize = -1;
-    hSize = -1;
-}
-
-/* de-constructor */
-T2TOutput::~T2TOutput()
-{
-}
-
-/*
-initialize the model 
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myDevID - device id
-*/
-void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
-{
-    devID = myDevID;
-
-    float minmax = 0;
-
-    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
-
-    InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
-}
-
-
-/* 
-make the network (redefined output tensor) 
->> input - input tensor
->> output - output tensor 
-*/
-void T2TOutput::Make(XTensor &input, XTensor &output)
-{
-    XTensor &x = input;
-
-    output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
-}
-
-}
--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TOUTPUT_H__
-#define __T2TOUTPUT_H__
-
-#include "../../tensor/function/FHeader.h"
-
-using namespace nts;
-
-namespace transformer
-{
-    
-#define OUTPUT_NAME "output"
-
-/* output layer */
-class T2TOutput
-{
-public:
-    /* device id */
-    int devID;
-
-    /* vocabulary size */
-    int vSize;
-
-    /* input vector size */
-    int inSize;
-
-    /* vector size of the linear transformation */
-    int hSize;
-
-    /* transformation matrix */
-    XTensor w;
-
-public:
-    /* constructor */
-    T2TOutput();
-
-    /* de-constructor */
-    ~T2TOutput();
-
-    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1);
-
-    /* make the network */
-    XTensor Make(XTensor &input);
-
-    /* make the network (redefined output tensor) */
-    void Make(XTensor &input, XTensor &output);
-};
-
-
-}
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
-  */
-
-#include "T2TPredictor.h"
-#include "../../tensor/core/CHeader.h"
-#include <iostream>
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* constructor */
-T2TStateBundle::T2TStateBundle()
-{
-    states = NULL;
-    isStart = false;
-}
-
-/* de-constructor */
-T2TStateBundle::~T2TStateBundle()
-{
-    if (states != NULL)
-        delete[] states;
-}
-
-/*
-create states
->> num - number of states
-*/
-void T2TStateBundle::MakeStates(int num)
-{
-    CheckNTErrors(num > 0, "invalid number");
-
-    if (states != NULL)
-        delete[] states;
-
-    states = new T2TState[num];
-
-    for (int i = 0; i < num; i++) {
-        states[i].prediction = -1;
-        states[i].pid = T2T_PID_EMPTY;
-        states[i].isEnd = false;
-        states[i].isStart = false;
-        states[i].isCompleted = false;
-        states[i].prob = 0;
-        states[i].probPath = 0;
-        states[i].modelScore = 0;
-        states[i].nstep = 0;
-        states[i].last = NULL;
-    }
-
-    stateNum = num;
-}
-
-/* constructor */
-T2TPredictor::T2TPredictor()
-{
-    startSymbol = 2;
-}
-
-/* de-constructor */
-T2TPredictor::~T2TPredictor()
-{
-}
-
-/*
-create an initial state
->> model - the t2t model
->> top - the top-most layer of the network
->> input - input of the network
->> beamSize - beam size
->> state - the state to be initialized
-*/
-void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
-{
-    int dims[MAX_TENSOR_DIM_NUM];
-    for (int i = 0; i < input->order - 1; i++)
-        dims[i] = input->GetDim(i);
-    dims[input->order - 1] = beamSize;
-
-    InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
-    InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
-    InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
-
-    state->probPath.SetZeroAll();
-    state->nstep.SetZeroAll();
-    state->endMark.SetZeroAll();
-
-    state->stateNum = 0;
-}
-
-/*
-set start symbol
->> symbol - the symbol (in integer)
-*/
-void T2TPredictor::SetStartSymbol(int symbol)
-{
-    startSymbol = symbol;
-}
-
-/*
-read a state
->> model - the t2t model that keeps the network created so far
->> state - a set of states. It keeps
-             1) hypotheses (states)
-             2) probabilities of hypotheses
-             3) parts of the network for expanding toward the next state
-*/
-void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
-{
-    m = model;
-    s = state;
-}
-
-/*
-predict the next state
->> next - next states (assuming that the current state has been read)
->> encoding - encoder output
->> inputEnc - input of the encoder
->> paddingEnc - padding of the encoder
->> isStart - whether this is the start of decoding (the start symbol is fed to the decoder)
-*/
-void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
-{
-    int dims[MAX_TENSOR_DIM_NUM];
-
-    /* word indices of positions up to next state */
-    XTensor inputDec;
-
-    /* the first token */
-    XTensor first;
-
-    CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
-    for (int i = 0; i < inputEnc->order - 1; i++)
-        dims[i] = inputEnc->GetDim(i);
-    dims[inputEnc->order - 1] = 1;
-
-    InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
-    SetDataFixedInt(first, startSymbol);
-
-    /* add a new word into the input sequence of the decoder side */
-    if (isStart) {
-        inputDec = Identity(first);
-    }
-    else {
-        /* only pass one step to the decoder */
-        inputDec = GetLastPrediction(s);
-        inputDec.SetDevice(inputEnc->devID);
-    }
-
-    /* prediction probabilities */
-    XTensor& output = next->prob;
-    XTensor decoding;
-
-    for (int i = 0; i < inputDec.order - 1; i++)
-        dims[i] = inputDec.GetDim(i);
-    dims[inputDec.order - 1] = inputDec.GetDim(-1);
-
-    XTensor paddingDec;
-    InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
-    SetDataFixedInt(paddingDec, 1);
-
-    XTensor maskDec;
-    XTensor maskEncDec;
-
-    /* decoder mask */
-    m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
-
-    /* make the decoding network */
-    decoding = m->decoder->Make(inputDec, *encoding, &maskDec, maskEncDec, false);
-
-    CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
-
-    /* generate the output probabilities */
-    m->outputLayer->Make(decoding, output);
-}
-
-/*
-generate paths up to the states of the current step
->> state - state bundle of the current step
-*/
-XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
-{
-    CheckNTErrors(state->stateNum >= 0, "Illegal state!");
-
-    int distance = -1;
-
-    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
-        int nsteps = 0;
-
-        while (cur != NULL) {
-            nsteps++;
-            cur = cur->last;
-        }
-
-        if (nsteps > distance)
-            distance = nsteps;
-    }
-
-    XTensor path;
-    InitTensor2DV2(&path, state->stateNum, distance, X_INT);
-    path.SetZeroAll();
-
-    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
-        int nsteps = 0;
-
-        while (cur != NULL) {
-            nsteps++;
-            path.Set2DInt(cur->prediction, i, distance - nsteps);
-            cur = cur->last;
-        }
-    }
-
-    return path;
-}
-
-/*
-get the predictions of the previous step
->> state - state bundle of the current step
-*/
-XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
-{
-    CheckNTErrors(state->stateNum >= 0, "Illegal state!");
-
-    XTensor lastPred;
-    InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
-
-    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
-
-        lastPred.Set2DInt(cur->prediction, i, 0);
-    }
-
-    return lastPred;
-}
-
-}
-
--- a/source/sample/transformer/T2TPredictor.h
+++ b/source/sample/transformer/T2TPredictor.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
-  * This is the first source file I create in 2019 - new start!
-  */
-
-#ifndef __T2TPREDICTOR_H__
-#define __T2TPREDICTOR_H__
-
-#include "T2TModel.h"
-#include "T2TLengthPenalty.h"
-
-namespace transformer
-{
-
-#define T2T_PID_EMPTY -1
-
-/* state for search. It keeps the path (back-pointer), prediction distribution,
-   and etc. It can be regarded as a hypothsis in translation. */
-class T2TState
-{
-public:
-    /* we assume that the prediction is an integer */
-    int prediction;
-
-    /* id of the problem. One can regard it as the sentence id when we
-       translate a number of sentences in the batched manner. The hypothesis
-       is empty if id = -1 */
-    int pid;
-
-    /* indicates whether the state is an end */
-    bool isEnd;
-
-    /* indicates whether the state is the start */
-    bool isStart;
-
-    /* indicates whether the state is completed */
-    bool isCompleted;
-
-    /* probability of every prediction (last state of the path) */
-    float prob;
-
-    /* probability of every path */
-    float probPath;
-
-    /* model score of every path. A model score = path probability + some other stuff */
-    float modelScore;
-
-    /* nubmer of steps we go over so far */
-    int nstep;
-
-    /* pointer to the previous state */
-    T2TState* last;
-};
-
-/* a bundle of states */
-class T2TStateBundle
-{
-public:
-    /* predictions */
-    XTensor prediction;
-
-    /* id of the previous state that generates the current one  */
-    XTensor preID;
-
-    /* mark that indicates whether each hypothesis is completed */
-    XTensor endMark;
-
-    /* probability of every prediction (last state of the path) */
-    XTensor prob;
-
-    /* probability of every path */
-    XTensor probPath;
-
-    /* model score of every path */
-    XTensor modelScore;
-
-    /* step number of each hypothesis */
-    XTensor nstep;
-
-    /* list of states */
-    T2TState* states;
-
-    /* number of states */
-    int stateNum;
-
-    /* indicates whether it is the first state */
-    bool isStart;
-
-public:
-    /* constructor */
-    T2TStateBundle();
-
-    /* de-constructor */
-    ~T2TStateBundle();
-
-    /* create states */
-    void MakeStates(int num);
-};
-
-/* The predictor reads the current state and then predicts the next.
-   It is exactly the same procedure of MT inference -
-   we get the state of previous words and then generate the next word.
-   Here, a state can be regared as the representation of words (word
-   indices, hidden states, embeddings and etc.).  */
-class T2TPredictor
-{
-private:
-    /* pointer to the transformer model */
-    T2TModel* m;
-
-    /* current state */
-    T2TStateBundle* s;
-
-    /* start symbol */
-    int startSymbol;
-
-public:
-    /* constructor */
-    T2TPredictor();
-
-    /* de-constructor */
-    ~T2TPredictor();
-
-    /* create an initial state */
-    void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
-
-    /* set the start symbol */
-    void SetStartSymbol(int symbol);
-
-    /* read a state */
-    void Read(T2TModel* model, T2TStateBundle* state);
-
-    /* predict the next state */
-    void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
-
-    /* generate paths up to the states of the current step */
-    XTensor GeneratePaths(T2TStateBundle* state);
-
-    /* get the predictions of the previous step */
-    XTensor GetLastPrediction(T2TStateBundle* state);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
-  */
-
-#ifndef __T2TSEARCH_H__
-#define __T2TSEARCH_H__
-
-#include "T2TModel.h"
-#include "T2TPredictor.h"
-
-namespace transformer
-{
-
-/* The class orgnizes the search process. It calls "predictors" to generate
-   distributions of the predictions and prunes the search space by beam pruning.
-   This makes a graph where each path respresents a translation hypothsis.
-   The output can be the path with the highest model score. */
-class T2TSearch
-{
-private:
-    /* the alpha parameter controls the length preference */
-    float alpha;
-
-    /* predictor */
-    T2TPredictor predictor;
-
-    /* max length of the generated sequence */
-    int maxLength;
-
-    /* beam size */
-    int beamSize;
-
-    /* batch size */
-    int batchSize;
-
-    /* we keep the final hypotheses in a heap for each sentence in the batch. */
-    XHeap<MIN_HEAP, float>* fullHypos;
-
-    /* array of the end symbols */
-    int* endSymbols;
-
-    /* number of the end symbols */
-    int endSymbolNum;
-
-    /* start symbol */
-    int startSymbol;
-
-    /* scalar of the input sequence (for max number of search steps) */
-    float scalarMaxLength;
-
-    /* indicate whether the early stop strategy is used */
-    bool isEarlyStop;
-
-public:
-    /* constructor */
-    T2TSearch();
-
-    /* de-constructor */
-    ~T2TSearch();
-
-    /* initialize the model */
-    void Init(int argc, char** argv);
-
-    /* search for the most promising states */
-    void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output, XTensor* score);
-
-    /* preparation */
-    void Prepare(int myBatchSize, int myBeamSize);
-
-    /* compute the model score for each hypothesis */
-    void Score(T2TStateBundle* prev, T2TStateBundle* beam);
-
-    /* generate token indices via beam pruning */
-    void Generate(T2TStateBundle* beam);
-
-    /* expand the search graph */
-    void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
-
-    /* collect hypotheses with ending symbol */
-    void Collect(T2TStateBundle* beam);
-
-    /* fill the hypotheis heap with incomplete hypothses */
-    void FillHeap(T2TStateBundle* beam);
-
-    /* save the output sequences and score */
-    void Dump(XTensor* output, XTensor* score);
-
-    /* check if the token is an end symbol */
-    bool IsEnd(int token);
-
-    /*check whether all hypotheses are completed*/
-    bool IsAllCompleted(T2TStateBundle* beam);
-
-    /* set end symbols for search */
-    void SetEnd(const int* tokens, const int tokenNum);
-
-    /* make a mask to prevent duplicated entries in beam expansion for the first position */
-    XTensor MakeFirstMask(T2TStateBundle* beam);
-};
-
-}
-
-#endif
--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
-  */
-
-#include <math.h>
-#include "T2TUtility.h"
-#include "T2TTester.h"
-#include "T2TSearch.h"
-#include "../../tensor/XUtility.h"
-#include "../../tensor/core/CHeader.h"
-#include "../../network/XNoder.h"
-#include "..//..//tensor/XTensor.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-/* constructor */
-T2TTester::T2TTester()
-{
-}
-
-/* de-constructor */
-T2TTester::~T2TTester()
-{
-}
-
-/* initialize the model */
-void T2TTester::Init(int argc, char** argv)
-{
-    LoadParamInt(argc, argv, "vsize", &vSize, 34040);
-    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
-    LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
-    LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
-
-    seacher.Init(argc, argv);
-}
-
-/*
-test the model
->> fn - test data file
->> ofn - output data file
->> model - model that is trained
-*/
-void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
-{
-    int wc = 0;
-    int wordCount = 0;
-    int wordCountTotal = 0;
-    int sentCount = 0;
-    int batchCount = 0;
-
-    /* data files */
-    FILE* ofile = fopen(ofn, "wb");
-    CheckNTErrors(ofile, "Cannot open the output file");
-
-    int devID = model->devID;
-
-    double startT = GetClockSec();
-
-    /* batch of input sequences */
-    XTensor batchEnc;
-
-    /* padding */
-    XTensor paddingEnc;
-
-    /* an array that keeps the sequences */
-    int* seqs = new int[MILLION];
-
-    batchLoader.Init(fn);
-
-
-    int count = 0;
-    while (!batchLoader.IsEmpty())
-    {
-        count++;
-        wordCount = 0;
-        for (int i = 0; i < model->decoder->nlayer; ++i) {
-            model->decoder->selfAttCache[i].miss = true;
-            model->decoder->enDeAttCache[i].miss = true;
-        }
-
-        vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
-
-        XTensor output;
-        XTensor score;
-        
-        seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
-        
-        for (int i = 0; i < indices.size(); ++i) {
-            Result res;
-            XTensor sent, srcIdx, tgtIdx;
-            InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
-            int idx[]{ i };
-            srcIdx.SetData(idx, 1);
-            InitTensorV2(&tgtIdx, &srcIdx);
-            SetAscendingOrder(tgtIdx, 0);
-
-            sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
-            res.values = sent;
-            res.id = indices[i];
-            batchLoader.resBuffer.emplace_back(res);
-        }
-
-        wc = batchEnc.GetDim(-1);
-        wordCount += wc;
-        wordCountTotal += wc;
-        sentCount += batchEnc.GetDim(-2);
-        batchCount += 1;
-
-        if (batchCount % 1 == 0) {
-            double elapsed = GetClockSec() - startT;
-            XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n", elapsed, sentCount, wordCount);
-        }
-    }
-
-    batchLoader.RerankRes();
-
-    for (auto res : batchLoader.resBuffer) {
-        Dump(ofile, &res.values);
-    }
-
-    fclose(ofile);
-
-    delete[] seqs;
-
-    double elapsed = GetClockSec() - startT;
-
-    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
-}
-
-/*
-dump the result into the file
->> file - data file
->> output - output tensor
-*/
-void T2TTester::Dump(FILE* file, XTensor* output)
-{
-    int seqLength = output->GetDim(-1);
-
-    for (int i = 0; i < output->unitNum; i += seqLength) {
-        for (int j = 0; j < seqLength; j++) {
-            int w = output->GetInt(i + j);
-            if (w < 0 || w == 1)
-                break;
-            fprintf(file, "%d ", w);
-            
-        }
-
-        fprintf(file, "\n");
-    }
-}
-
-}
--- a/source/sample/transformer/T2TTester.h
+++ b/source/sample/transformer/T2TTester.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2019, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /*
-  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
-  * A week with no trips :)
-  */
-
-#ifndef __T2TTESTER_H__
-#define __T2TTESTER_H__
-
-#include "T2TSearch.h"
-#include "t2tdata/DataSet.h"
-
-namespace transformer
-{
-
-/* This class translates test sentences with a trained model. */
-class T2TTester
-{
-public:
-    /* vocabulary size of the source side */
-    int vSize;
-
-    /* vocabulary size of the target side */
-    int vSizeTgt;
-
-    /* batch size for sentences */
-    int sentBatch;
-
-    /* for batching */
-    DataSet batchLoader;
-
-    /* decoder for inference */
-    T2TSearch seacher;
-
-public:
-    /* constructor */
-    T2TTester();
-
-    /* de-constructor */
-    ~T2TTester();
-
-    /* initialize the model */
-    void Init(int argc, char** argv);
-
-    /* test the model */
-    void Test(const char* fn, const char* ofn, T2TModel* model);
-
-    /* dump the result into the file */
-    void Dump(FILE* file, XTensor* output);
-};
-
-}
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/T2TUtility.cpp
+++ b/source/sample/transformer/T2TUtility.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-namespace transformer
-{
-
-FILE * tmpFILE;
-int llnum = 0;
-FILE * tf = NULL;
-
-void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            strcpy(p, argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        strcpy(p, defaultP);
-}
-
-void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            *(int*)p = atoi(argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname)){
-            *(bool*)p = true;
-            //fprintf(stderr, " %s=%s\n", name, "true");
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
-{
-    char vname[128];
-    vname[0] = '-';
-    strcpy(vname + 1, name);
-    bool hit = false;
-    for(int i = 0; i < argc; i++){
-        if(!strcmp(argv[i], vname) && i + 1 < argc){
-            *p = (float)atof(argv[i + 1]);
-            //fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
-            hit = true;
-        }
-    }
-    if(!hit)
-        *p = defaultP;
-}
-
-void ShowParams(int argc, char ** argv)
-{
-    fprintf(stderr, "args:\n");
-    for(int i = 0; i < argc; i++){
-        if(argv[i][1] == 0)
-            continue;
-        if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
-            if(i + 1 < argc && argv[i + 1][0] != '-')
-                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
-            else
-                fprintf(stderr, " %s=yes\n", argv[i]);
-        }
-    }
-    fprintf(stderr, "\n");
-}
-
-}
--- a/source/sample/transformer/T2TUtility.h
+++ b/source/sample/transformer/T2TUtility.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- */
-
-#ifndef __T2TUTILITY_H__
-#define __T2TUTILITY_H__
-
-#include <stdio.h>
-
-namespace transformer
-{
-
-extern FILE * tmpFILE;
-
-/* load arguments */
-void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
-void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
-void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
-void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
-
-/* show arguments */
-void ShowParams(int argc, char ** argv);
-
-extern int llnum;
-extern FILE * tf;
-
-}
-
-#endif
--- a/source/sample/transformer/module/T2TUtility.cpp
+++ b/source/sample/transformer/module/T2TUtility.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,13 +26,13 @@
 #include <fstream>
 #include <sstream>

-#include "T2TUtility.h"
-#include "../../../tensor/XGlobal.h"
+#include "Utility.h"
+#include "../../tensor/XGlobal.h"

 using namespace nts;
 using namespace std;

-namespace transformer
+namespace nmt
 {

 /*
@@ -41,7 +40,7 @@ load configurations from the command
 >> argc - number of arguments
 >> argv - the list of arguments
 */
-T2TConfig::T2TConfig(int argc, const char** argv)
+Config::Config(int argc, const char** argv)
 {
    char** args = new char* [MAX_PARAM_NUM];
    for (int i = 0; i < argc; i++) {
@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    ShowParams(argsNum, args);

    /* options for the model */
-    LoadParamInt(argsNum, args, "nhead", &nhead, 8);
-    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
-    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
+    LoadParamInt(argsNum, args, "nhead", &nhead, 4);
+    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
+    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
-    LoadParamInt(argsNum, args, "embsize", &embSize, 256);
-    LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
+    LoadParamInt(argsNum, args, "embsize", &embSize, 512);
+    LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
-    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
-    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
-    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
+    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
+    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
+    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
    LoadParamInt(argsNum, args, "padid", &padID, 1);
    LoadParamInt(argsNum, args, "startid", &startID, 2);
    LoadParamInt(argsNum, args, "endid", &endID, 2);
    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
-    LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
-    LoadParamString(argsNum, args, "model", modelFN, "model.bin");
+    LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
+
+    // TODO: refactor the parameters type to support weight sharing during training
+    LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
+    LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
+    LoadParamString(argsNum, args, "model", modelFN, "");
    LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
    LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");

@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamString(argsNum, args, "train", trainFN, "");
    LoadParamString(argsNum, args, "valid", validFN, "");
    LoadParamInt(argsNum, args, "dev", &devID, 0);
-    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
-    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
+    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
+    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
    isTraining = (strcmp(trainFN, "") == 0) ? false : true;
    LoadParamBool(argsNum, args, "mt", &isMT, true);
-    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
-    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
-    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
+    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
+    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
+    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);

-    LoadParamFloat(argc, args, "lrate", &lrate, 1.0F);
+    LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
    LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
-    LoadParamInt(argc, args, "nepoch", &nepoch, 20);
+    LoadParamInt(argc, args, "nepoch", &nepoch, 50);
+    LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
    LoadParamInt(argc, args, "nstep", &nstep, 100000);
-    LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000);
+    LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
    LoadParamBool(argc, args, "adam", &useAdam, true);
    LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
    LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamBool(argc, args, "shuffled", &isShuffled, true);
    LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
    LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
-    LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false);
+    LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
    LoadParamInt(argc, args, "updatestep", &updateStep, 1);
-    LoadParamBool(argc, args, "debug", &isDebugged, false);
    LoadParamBool(argc, args, "sorted", &isLenSorted, false);

    LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
    LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
-    LoadParamInt(argc, args, "bucketsize", &bucketSize, 0);
+    LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);

    /* options for translating */
    LoadParamString(argsNum, args, "test", testFN, "");
@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
    LoadParamBool(argsNum, args, "fp16", &useFP16, false);
    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
-    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
+    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);

    for (int i = 0; i < argc; i++)
        delete[] args[i];
@@ -136,7 +139,7 @@ load configurations from a file
 >> args - the list to store the configurations
 format: one option per line, separated by a blank or a tab
 */
-int T2TConfig::LoadFromFile(const char* configFN, char** args) {
+int Config::LoadFromFile(const char* configFN, char** args) {
    ifstream f(configFN, ios::in);
    CheckNTErrors(f.is_open(), "unable to open the config file");


--- a/source/sample/transformer/module/T2TUtility.h
+++ b/source/sample/transformer/module/T2TUtility.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,18 +19,18 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
 */

-#ifndef __T2TUTILITY_H__
-#define __T2TUTILITY_H__
+#ifndef __UTILITY_H__
+#define __UTILITY_H__

 #include <string>
 #include <cstdio>

-#include "../../../tensor/XList.h"
+#include "../../tensor/XList.h"

 using namespace std;
 using namespace nts;

-namespace transformer
+namespace nmt
 {

 #define MAX_PARAM_NUM 100
@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
 FloatList SplitFloat(const string& s, const string& delimiter);
 UInt64List SplitToPos(const string& s, const string& delimiter);

-/* configurations for t2t */
-class T2TConfig {
+/* configurations for the NMT system */
+class Config {
 public:
    /* path to the model */
    char modelFN[1024];
@@ -131,6 +130,12 @@ public:
    /* indicates whether the model is running for machine translation */
    bool isMT;

+    /* indicates whether the encoder and decoder share their embeddings */
+    int shareAllEmbeddings;
+
+    /* indicates whether the decoder shares its input embeddings with the output weights */
+    int shareDecInputOutputWeight;
+
    /* indicates whether the model is running with FP16 data type */
    bool useFP16;

@@ -164,9 +169,12 @@ public:
    /* training epoch number */
    int nepoch;

-    /* traing step number */
+    /* training step number */
    int nstep;

+    /* the maximum number of saved checkpoints */
+    int maxCheckpoint;
+
    /* indicates whether we use Adam */
    bool useAdam;

@@ -193,9 +201,6 @@ public:
    /* number of batches on which we do model update */
    int updateStep;

-    /* indicates whether we intend to debug the net */
-    bool isDebugged;
-
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

@@ -222,7 +227,7 @@ public:
 public:

    /* load configurations from the command */
-    T2TConfig(int argc, const char** argv);
+    Config(int argc, const char** argv);

    /* load configurations from a file */
    int LoadFromFile(const char* configFN, char** args);

--- a/source/sample/transformer/module/T2TAttention.cpp
+++ b/source/sample/transformer/module/T2TAttention.cpp
--- a/source/sample/transformer/module/T2TAttention.h
+++ b/source/sample/transformer/module/T2TAttention.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,17 +19,17 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
 */

-#ifndef __T2TATTENTION_H__
-#define __T2TATTENTION_H__
+#ifndef __ATTENTION_H__
+#define __ATTENTION_H__

-#include "T2TNNUtil.h"
-#include "T2TUtility.h"
+#include "NNUtil.h"
+#include "../Utility.h"
 #include "../../../network/XNet.h"
 #include "../../../tensor/core/CHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {
 /* attention type */
 enum { NONE, SELF_ATT, EN_DE_ATT };
@@ -50,6 +49,9 @@ public:
    /* indicates cache miss if 'true' */
    bool miss;

+    /* indicates whether we use cache */
+    bool enable;
+
    /* constructor */
    Cache();

@@ -64,7 +66,7 @@ public:
 };

 /* multi-head attention */
-class T2TAttention
+class Attention
 {
 public:
    /* device id */
@@ -74,22 +76,22 @@ public:
    int nhead;

    /* transformation matrix for Q */
-    XTensor wq;
+    XTensor weightQ;

    /* bias for Q */
-    XTensor bq;
+    XTensor biasQ;

    /* transformation matrix for K */
-    XTensor wk;
+    XTensor weightK;

    /* bias for K */
-    XTensor bk;
+    XTensor biasK;

    /* transformation matrix for V */
-    XTensor wv;
+    XTensor weightV;

    /* bias for V */
-    XTensor bv;
+    XTensor biasV;

    XTensor wBig;

@@ -99,10 +101,10 @@ public:
    XTensor RPEmbK;

    /* transformation after dot-product attention */
-    XTensor wo;
+    XTensor weightO;

    /* bias after dot-product attention */
-    XTensor bo;
+    XTensor biasO;

    /* size of transformed Q and K */
    int dk;
@@ -124,13 +126,13 @@ public:

 public:
    /* constructor */
-    T2TAttention();
+    Attention();

    /* de-constructor */
-    ~T2TAttention();
+    ~Attention();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& k, XTensor& q, XTensor& v,
@@ -145,8 +147,10 @@ public:
    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
                             XTensor* mask, bool isTraining, bool isEnc);

+    /* generate relative position embeddings */
    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);

+    /* relative position-aware dot-product attention inner calculation */
    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
 };
 }

--- a/source/sample/transformer/module/T2TCommonModules.cpp
+++ b/source/sample/transformer/module/T2TCommonModules.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,11 @@
 * This file includes some common modules of the Transformer model
 */

-#include <cmath>
-
-#include "T2TCommonModules.h"
+#include "CommonModules.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* 
@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
 >> before - whether we use layernorm before attention/fnn
 >> after - whether we use layernorm after attention/fnn
 */
-XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
 {
    if (after ^ prenorm)
        return ln.Make(input);

--- a/source/sample/transformer/module/T2TCommonModules.h
+++ b/source/sample/transformer/module/T2TCommonModules.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,16 +21,16 @@
 #ifndef __COMMONMODULE_H__
 #define __COMMONMODULE_H__

-#include "T2TLayerNormal.h"
-#include "T2TCommonModules.h"
+#include "LayerNorm.h"
+#include "CommonModules.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* the layer normalization module to control pre-norm or post-norm*/
-XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);

 }


--- a/source/sample/transformer/module/T2TEmbedding.cpp
+++ b/source/sample/transformer/module/T2TEmbedding.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,17 +19,15 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
 */

-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TEmbedder::T2TEmbedder()
+Embedder::Embedder()
 {
    devID = -1;
    vSize = -1;
@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
 }

 /* de-constructor */
-T2TEmbedder::~T2TEmbedder()
+Embedder::~Embedder()
 {
 }

@@ -47,7 +44,7 @@ initialize the model
 >> config - configurations of the model
 >> isEnc - indicates if it is used for the encoder
 */
-void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
+void Embedder::InitModel(Config& config, bool isEnc)
 {
    devID = config.devID;
    d = config.modelSize;
@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
 make positional embeddings (of size eSize * length)
 >> length - length of the sequence
 */
-void T2TEmbedder::MakePosEmbedding(int length)
+void Embedder::MakePosEmbedding(int length)
 {
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

@@ -110,58 +107,45 @@ make the network
 >> isTraining - indicates whether it is training
 << return - word & position embeddings of the input
 */
-XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
+XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
 {
    /* make sure the padding index is 1 */
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
-    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
-    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
+    CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
+    CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");

    XTensor wordEmbedding, position, posEmbedding;
-    InitTensor(&position, &input);
-
-    int* posData = new int[input.unitNum];

-    XTensor inputCPU;
-    InitTensorOnCPU(&inputCPU, &input);
-    _CopyValues(&input, &inputCPU);
+    InitTensor1D(&position, input.GetDim(-1), X_INT, devID);

-    if (!isDec)
+    if (!isDec || isTraining || input.GetDim(-1) > 1)
    {
-        /* encoder embeddings */
-        for (int i = 0; i < inputCPU.dimSize[0]; i++) {
-            int startNoPad = 1 + 1;
-            int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
-            for (int j = 0; j < inputCPU.dimSize[1]; j++) {
-                if (p[j] == 1) {
-                    posData[i * inputCPU.dimSize[1] + j] = 1;
-                }
-                else {
-                    posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
-                }
-            }
-        }
-        position.SetData(posData, position.unitNum);
+        position.Range(0, position.unitNum, 1);
+
+        // disable grad
+        ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
    }
    else
    {
-        /* decoder embeddings */
-        position.SetDataFixed(nstep + 2);
+        /* decoder embeddings during decoding */
+        position.SetDataFixed(nstep + padIdx + 1);
    }

-    delete[] posData;
-
    /* we make positional embeddings first */
-    posEmbedding = Gather(posEmbeddingBase, position);
+    XTensor embTMP;
+    embTMP = Gather(posEmbeddingBase, position);
+    posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));

    /* then we make word embeddings */
+    //w.enableGrad = false;
    wordEmbedding = Gather(w, input);

    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
-    return wordEmbedding + posEmbedding;
+    SumMe(wordEmbedding, posEmbedding);
+    return wordEmbedding;
 }

 }
\ No newline at end of file
--- a/source/sample/transformer/module/T2TEmbedding.h
+++ b/source/sample/transformer/module/T2TEmbedding.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,15 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
 */

-#ifndef __T2TEMBEDDING_H__
-#define __T2TEMBEDDING_H__
+#ifndef __EMBEDDING_H__
+#define __EMBEDDING_H__

-#include "T2TUtility.h"
+#include "../Utility.h"
 #include "../../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 #define DEFAULT_EMBEDDING_SIZE 512
@@ -37,7 +36,7 @@ namespace transformer
 embedding (of word at position i):
 word embedding + positional embedding
 */
-class T2TEmbedder
+class Embedder
 {
 public:
    /* device id */
@@ -52,7 +51,7 @@ public:
    /* maximum length of the sequence */
    int maxLength;

-    /* dimension size of the hidden layers in the t2t model */
+    /* dimension size of the hidden layers in the model */
    int d;

    /* padding index */
@@ -67,13 +66,13 @@ public:

 public:
    /* constructor */
-    T2TEmbedder();
+    Embedder();

    /* de-constructor */
-    ~T2TEmbedder();
+    ~Embedder();

    /* initialize the model */
-    void InitModel(T2TConfig& config, bool isEnc = true);
+    void InitModel(Config& config, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int length);

--- a/source/sample/transformer/module/T2TFNN.cpp
+++ b/source/sample/transformer/module/T2TFNN.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,19 +19,17 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TFNN.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "FNN.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TFNN::T2TFNN()
+FNN::FNN()
 {
    inSize = -1;
    outSize = -1;
@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
 }

 /* de-constructor */
-T2TFNN::~T2TFNN()
+FNN::~FNN()
 {
 }

@@ -50,7 +47,7 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> config - configurations of the model
 */
-void T2TFNN::InitModel(T2TConfig& config)
+void FNN::InitModel(Config& config)
 {
    devID = config.devID;

@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
    _SetDataFanInOut(&w1, scale);
    _SetDataFanInOut(&w2, scale);

+    w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
+    w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
+
    b1.SetZeroAll();
    b2.SetZeroAll();
 }
@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
 >> input - the input tensor
 >> return - the output tensor
 */
-XTensor T2TFNN::Make(XTensor& input, bool isTraining)
+XTensor FNN::Make(XTensor& input, bool isTraining)
 {
    XTensor t1;


--- a/source/sample/transformer/module/T2TFNN.h
+++ b/source/sample/transformer/module/T2TFNN.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,20 +19,20 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TFNN_H__
-#define __T2TFNN_H__
+#ifndef __FNN_H__
+#define __FNN_H__

-#include "T2TUtility.h"
-#include "T2TLayerNormal.h"
+#include "LayerNorm.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
-class T2TFNN
+class FNN
 {
 public:
    /* device id */
@@ -66,13 +65,13 @@ public:
 public:

    /* constructor */
-    T2TFNN();
+    FNN();

    /* de-constructor */
-    ~T2TFNN();
+    ~FNN();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input, bool isTraining);

--- a/source/sample/transformer/module/T2TGatedLinearUnit.cpp
+++ b/source/sample/transformer/module/T2TGatedLinearUnit.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,16 +18,13 @@
 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
 */

-
-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TGatedLinearUnit.h"
+#include "GLU.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -48,7 +44,7 @@ GLU::~GLU()
 initialize the model
 >> config - configurations of the model
 */
-void GLU::InitModel(T2TConfig& config)
+void GLU::InitModel(Config& config)
 {
    devID = config.devID;


--- a/source/sample/transformer/module/T2TGatedLinearUnit.h
+++ b/source/sample/transformer/module/T2TGatedLinearUnit.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,12 +22,11 @@
 #ifndef __GLU_H__
 #define __GLU_H__

-#include "T2TLayerNormal.h"
-#include "T2TGatedLinearUnit.h"
+#include "LayerNorm.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
@@ -68,7 +66,7 @@ public:
    ~GLU();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input);

--- a/source/sample/transformer/module/T2TLayerHistory.cpp
+++ b/source/sample/transformer/module/T2TLayerHistory.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,19 +18,16 @@
 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
 */

-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TLayerNormal.h"
-#include "T2TLayerHistory.h"
-
+#include "Embedding.h"
+#include "LayerNorm.h"
+#include "LayerHistory.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

 #define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
 #define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
 initialize the model
 >> config - configurations of the model
 */
-void LayerHistory::InitModel(T2TConfig& config)
+void LayerHistory::InitModel(Config& config)
 {
    devID = config.devID;
    d = config.modelSize;
@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)

    InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);

-    layerNorms = new T2TLN[nlayer];
+    layerNorms = new LN[nlayer];

    /* initialize the layer normalization of each layer */
    for (int i = 0; i < nlayer; i++) {

--- a/source/sample/transformer/module/T2TLayerHistory.h
+++ b/source/sample/transformer/module/T2TLayerHistory.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,14 +21,14 @@
 #ifndef __LAYERHISTORY_H__
 #define __LAYERHISTORY_H__

-#include "T2TLayerNormal.h"
-#include "T2TLayerHistory.h"
+#include "LayerNorm.h"
+#include "LayerHistory.h"

 #include "../../../tensor/function/FHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /*
@@ -61,7 +60,7 @@ public:
    TensorList history;

    /* layer normalization for each intimidate layer */
-    T2TLN* layerNorms;
+    LN* layerNorms;

 public:
    /* constructor */
@@ -71,7 +70,7 @@ public:
    ~LayerHistory();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* add the layer output to the history */
    void Add(XTensor& tensor);

--- a/source/sample/transformer/module/T2TLayerNormal.cpp
+++ b/source/sample/transformer/module/T2TLayerNormal.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,24 +19,23 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TLayerNormal.h"
+#include "Embedding.h"
+#include "LayerNorm.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TLN::T2TLN()
+LN::LN()
 {
    devID = -1;
    d = 0;
 }

 /* de-constructor */
-T2TLN::~T2TLN()
+LN::~LN()
 {
 }

@@ -47,7 +45,7 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> config - configurations of the model
 */
-void T2TLN::InitModel(T2TConfig& config)
+void LN::InitModel(Config& config)
 {
    devID = config.devID;

@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
    InitTensor1D(&b, d, X_FLOAT, devID);
    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();
+
+    w.SetDataFixed(1);
 }

 /*
@@ -64,7 +64,7 @@ make the network
 >> input - the input tensor
 >> return - layer normalization output
 */
-XTensor T2TLN::Make(XTensor& input)
+XTensor LN::Make(XTensor& input)
 {
    XTensor& x = input;
    XTensor xn;

--- a/source/sample/transformer/module/T2TLayerNormal.h
+++ b/source/sample/transformer/module/T2TLayerNormal.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,20 +19,20 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TLAYERNORMAL_H__
-#define __T2TLAYERNORMAL_H__
+#ifndef __LAYERNORMAL_H__
+#define __LAYERNORMAL_H__

-#include "T2TUtility.h"
-#include "../../../network/XNet.h"
+#include "../Utility.h"
+#include "../../../network//XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* layer normalization: y = norm(x) * w + b
   where norm(x) = (x - mean)/standardDeviation */
-class T2TLN
+class LN
 {
 public:
    /* device id */
@@ -50,13 +49,13 @@ public:

 public:
    /* constructor */
-    T2TLN();
+    LN();

    /* de-constructor */
-    ~T2TLN();
+    ~LN();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input);

--- a/source/sample/transformer/module/T2TNNUtil.cpp
+++ b/source/sample/transformer/module/T2TNNUtil.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,12 +15,12 @@
 */

 /*
- * $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
 */

-#include "T2TNNUtil.h"
+#include "NNUtil.h"

-namespace transformer
+namespace nmt
 {

 /* 

--- a/source/sample/transformer/module/T2TNNUtil.h
+++ b/source/sample/transformer/module/T2TNNUtil.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,11 +15,11 @@
 */

 /*
- * $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
 */

-#ifndef __T2TNNUTIL_H__
-#define __T2TNNUTIL_H__
+#ifndef __NNUTIL_H__
+#define __NNUTIL_H__

 #include "../../../tensor/XGlobal.h"
 #include "../../../tensor/core/CHeader.h"
@@ -28,7 +27,7 @@

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* the gather function for tensor with any dimension */

--- a/source/sample/transformer/module/T2TOutput.cpp
+++ b/source/sample/transformer/module/T2TOutput.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,18 +19,16 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TOutput.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "Output.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TOutput::T2TOutput()
+Output::Output()
 {
    devID = -1;
    vSize = -1;
@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
 }

 /* de-constructor */
-T2TOutput::~T2TOutput()
+Output::~Output()
 {
 }

@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
 initialize the model
 >> config - configurations of the model
 */
-void T2TOutput::InitModel(T2TConfig& config)
+void Output::InitModel(Config& config)
 {
    devID = config.devID;
    hSize = config.modelSize;
@@ -66,7 +63,7 @@ make the network (redefined output tensor)
 >> isTraining - whether it is used for training
 >> normalized - whether ignore the log-softmax
 */
-void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
+void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
 {
    XTensor& x = input;


--- a/source/sample/transformer/module/T2TOutput.h
+++ b/source/sample/transformer/module/T2TOutput.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,19 +19,19 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TOUTPUT_H__
-#define __T2TOUTPUT_H__
+#ifndef __OUTPUT_H__
+#define __OUTPUT_H__

-#include "T2TUtility.h"
+#include "../Utility.h"
 #include "../../../tensor/function/FHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* output layer */
-class T2TOutput
+class Output
 {
 public:
    /* device id */
@@ -49,13 +48,13 @@ public:

 public:
    /* constructor */
-    T2TOutput();
+    Output();

    /* de-constructor */
-    ~T2TOutput();
+    ~Output();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network (redefined output tensor) */
    void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);

--- a/source/sample/transformer/t2tdata/DataSet.cpp
+++ b/source/sample/transformer/t2tdata/DataSet.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
-*/
-
-#include "DataSet.h"
-#include "StringUtil.h"
-
-#include <string>
-#include <vector>
-#include <fstream>
-#include <algorithm>
-#include "..//..//..//tensor/XUtility.h"
-
-using namespace nts;
-
-bool Compare(const Example& a, const Example& b) {
-    return a.values.size() > b.values.size();
-}
-
-bool CompareRes(const Result& a, const Result& b) {
-    return a.id < b.id;
-}
-
-void DataSet::RerankRes(){
-    sort(resBuffer.begin(), resBuffer.end(), CompareRes);
-}
-
-/*
-load data from the file to the buffer
-*/
-void DataSet::LoadDataToBuffer()
-{
-    string line;
-    buffer.clear();
-    bufferUsed = 0;
-    const string tokenDelimiter = " ";
-
-    int id = 0;
-    while (getline(*fp, line)) {
-        vector<int> values = Split<int>(line, tokenDelimiter);
-
-        Example example;
-        example.id = id++;
-        example.values = values;
-        buffer.emplace_back(example);
-    }
-    if (fp->eof()) {
-        fp->seekg(fp->beg);
-    }
-    if (sortBuffer) {
-        sort(buffer.begin(), buffer.end(), Compare);
-    }
-    resBuffer.reserve(buffer.size());
-}
-
-/*
-select a field and generate a mini-batch by indices
->>> batchEnc - a tensor to store the batch of input
->>> paddingEnc - a tensor to store the batch of paddings
->>> batchSize - batch size
->>> devID - devices id, -1 for CPU
->>> mem - the memory pool
-*/
-vector<int> DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, 
-                        size_t batchSize, int devID)
-{
-    size_t realBatchSize = batchSize;
-
-    /* real batch size */
-    if ((buffer.size()-bufferUsed) < batchSize) {
-        realBatchSize = buffer.size()-bufferUsed;
-    }
-
-    /* get the maximum sentence length in a mini-batch */
-    size_t maxLen = 0;
-    if (realBatchSize == 1)
-        maxLen = buffer[bufferUsed].values.size();
-    for (size_t i = 0; i < realBatchSize - 1; ++i) {
-        maxLen = max(maxLen, buffer[bufferUsed+i].values.size());
-    }
-    CheckNTErrors(maxLen != 0, "wrong length dectected");
-
-    int* batchValues = new int[realBatchSize * maxLen];
-    float* paddingValues = new float[realBatchSize * maxLen];
-
-    for (int i = 0; i < realBatchSize * maxLen; ++i) {
-        batchValues[i] = 1.0F;
-    }
-    memset(paddingValues, 0, sizeof(float) * maxLen * realBatchSize);
-
-    size_t cur = 0;
-    
-    /* left padding */
-    vector<int> indices;
-    indices.reserve(realBatchSize);
-    for (size_t i = 0; i < realBatchSize; ++i) {
-        indices.push_back(buffer[bufferUsed + i].id);
-        cur = maxLen * (i + 1) - buffer[bufferUsed+i].values.size();
-        for (int v : buffer[bufferUsed + i].values) {
-            batchValues[cur] = v;
-            paddingValues[cur++] = 1.0F;
-        }
-        cur = maxLen * (i + 1);
-    }
-
-    InitTensor2DV2(batchEnc, realBatchSize, maxLen, X_INT, devID);
-    InitTensor2DV2(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);
-
-    bufferUsed += realBatchSize;
-
-    batchEnc->SetData(batchValues, batchEnc->unitNum);
-    paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
-
-    delete[] batchValues;
-    delete[] paddingValues;
-
-    return indices;
-}
-
-/*
-the constructor of DataSet
->>> fname - path of the data file
-*/
-void DataSet::Init(const char* fname)
-{
-    fp = new ifstream(fname);
-    CheckNTErrors(fp->is_open(), "can not open the file");
-    bufferUsed = 0;
-    
-    LoadDataToBuffer();
-    if (bufferSize == 0)
-        bufferSize = buffer.size();
-}
-
--- a/source/sample/transformer/t2tdata/DataSet.h
+++ b/source/sample/transformer/t2tdata/DataSet.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
-*/
-
-#ifndef __DATASET_H__
-#define __DATASET_H__
-
-#include "../../..//tensor/XTensor.h"
-#include "../../..//tensor/XGlobal.h"
-
-#include <cstdio>
-#include <fstream>
-#include <unordered_map>
-#include <vector>
-
-using namespace std;
-using namespace nts;
-
-struct Example {
-    int id;
-    vector<int> values;
-};
-
-struct Result {
-    int id;
-    XTensor values;
-};
-
-using BufferType = vector<Example>;
-using ResBufferType = vector<Result>;
-
-bool Compare(const Example& a, const Example& b);
-
-bool CompareRes(const Result& a, const Result& b);
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* A `DataSet` is associated with a file which contains variable length data.*/
-struct DataSet {
-
-    /* the data buffer */
-    BufferType buffer;
-
-    /* the result buffer */
-    ResBufferType resBuffer;
-
-    /* the pointer to file stream */
-    ifstream* fp{nullptr};
-
-    /* size of the data buffer */
-    size_t bufferSize{ 0 };
-
-    /* size of used data in buffer */
-    size_t bufferUsed{ 0 };
-
-    /* wether sort the dataset */
-    bool sortBuffer{ true };
-
-    /* load data from a file to the buffer */
-    void LoadDataToBuffer();
-
-    /* rerank result for output */
-    void RerankRes();
-
-    /* generate a mini-batch */
-    vector<int> LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, 
-                   size_t batchSize, int devID);
-
-    /* initlization function */
-    void Init(const char* fname);
-
-    /* check if the buffer is empty */
-    bool IsEmpty() {
-        if (bufferUsed < bufferSize)
-            return false;
-        return true;
-    }
-
-    /* de-constructor */
-    ~DataSet() {
-        if (fp)
-            fp->close();
-        delete fp;
-    }
-};
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __DATASET_H__
\ No newline at end of file
--- a/source/sample/transformer/t2tdata/StringUtil.cpp
+++ b/source/sample/transformer/t2tdata/StringUtil.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
-*/
-
-#include "StringUtil.h"
-
-namespace nts {
-
-/* split string by delimiter, this will return indices of all sub-strings */
-vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
-{
-    vector<pair<int, int>> fields;
-    if (delimiter.length() == 0) {
-        fields.emplace_back(0, s.length());
-        return fields;
-    }
-    int pos = 0;
-    int start = 0;
-    while ((pos = s.find(delimiter, start)) != string::npos) {
-        if (pos != start) {
-            fields.emplace_back(start, pos);
-        }
-        start = pos + delimiter.length();
-    }
-    if (start != s.length()) {
-        fields.emplace_back(start, s.length());
-    }
-    return fields;
-}
-}
\ No newline at end of file
--- a/source/sample/transformer/t2tdata/StringUtil.h
+++ b/source/sample/transformer/t2tdata/StringUtil.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
- */
-
-#ifndef __STRING_UTIL_H__
-#define __STRING_UTIL_H__
-
-#include <cstdlib>
-#include <string>
-#include <utility>
-#include <vector>
-using namespace std;
-
-namespace nts {
-
-/* Splits a string based on the given delimiter string. Each pair in the
- * returned vector has the start and past-the-end positions for each of the
- * parts of the original string. Empty fields are not represented in the output.
- */
-vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
-
-/* Splits the given string and converts each part to the given T. */
-template <typename T>
-vector<T> Split(const string& s, const string& delimiter);
-
-template <>
-inline vector<string> Split(const string& s, const string& delimiter)
-{
-    vector<string> fields;
-    for (const auto& p : SplitToPos(s, delimiter)) {
-        fields.emplace_back(s.substr(p.first, p.second - p.first));
-    }
-    return fields;
-}
-
-template <>
-inline vector<int> Split(const string& s, const string& delimiter)
-{
-    vector<int> fields;
-    for (const auto& p : SplitToPos(s, delimiter)) {
-        fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
-    }
-    return fields;
-}
-
-template <>
-inline vector<int64_t> Split(const string& s, const string& delimiter)
-{
-    vector<int64_t> fields;
-    for (const auto& p : SplitToPos(s, delimiter)) {
-        fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
-    }
-    return fields;
-}
-
-template <>
-inline vector<float> Split(const string& s, const string& delimiter)
-{
-    vector<float> fields;
-    for (const auto& p : SplitToPos(s, delimiter)) {
-        fields.emplace_back(strtof(s.data() + p.first, nullptr));
-    }
-    return fields;
-}
-
-template <>
-inline vector<uint8_t> Split(const string& s, const string& delimiter)
-{
-    vector<uint8_t> fields;
-    for (const auto& p : SplitToPos(s, delimiter)) {
-        fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
-    }
-    return fields;
-}
-
-} // namespace nts
-
-#endif // __STRING_UTIL_H__
--- a/source/sample/transformer/train/T2TBatchLoader.cpp
+++ b/source/sample/transformer/train/T2TBatchLoader.cpp
--- a/source/sample/transformer/train/T2TBatchLoader.h
+++ b/source/sample/transformer/train/T2TBatchLoader.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
- * it is cold today but I'll move to a warm place tomorrow :)
- */
-
-#ifndef __T2TBATCHLOADER_H__
-#define __T2TBATCHLOADER_H__
-
-#include "../module/T2TUtility.h"
-#include "../../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-#define MAX_SEQUENCE_LENGTH 1024 * 4
-
-/* node to keep batch information */
-struct BatchNode
-{
-    /* beginning position */
-    int beg;
-
-    /* end position */
-    int end;
-
-    /* maximum word number on the encoder side */
-    int maxEnc;
-
-    /* maximum word number on the decoder side */
-    int maxDec;
-
-    /* a key for sorting */
-    int key;
-};
-
-class T2TBatchLoader
-{
-public:
-    /* buffer for loading words */
-    int* buf;
-
-    /* another buffer */
-    int* buf2;
-
-    /* batch buf */
-    BatchNode* bufBatch;
-
-    /* buffer size */
-    int bufSize;
-
-    /* size of batch buffer */
-    int bufBatchSize;
-
-    /* length of each sequence */
-    int* seqLen;
-
-    /* another array */
-    int* seqLen2;
-
-    /* offset of the first word for each sequence */
-    int* seqOffset;
-
-    /* number of sequences in the buffer */
-    int nseqBuf;
-
-    /* offset for next sequence in the buffer */
-    int nextSeq;
-
-    /* offset for next batch */
-    int nextBatch;
-
-    /* indicates whether we double the </s> symbol for the output of LM */
-    bool isDoubledEnd;
-
-    /* indicates whether we use batchsize = max * sc
-       rather rather than batchsize = word-number, where max is the maximum
-       length and sc is the sentence number */
-    bool isSmallBatch;
-
-    /* counterpart of "isSmallBatch" */
-    bool isBigBatch;
-
-    /* randomize batches */
-    bool isRandomBatch;
-
-    /* bucket size */
-    int bucketSize;
-
-public:
-    /* constructor */
-    T2TBatchLoader();
-
-    /* de-constructor */
-    ~T2TBatchLoader();
-
-    /* initialization */
-    void Init(T2TConfig& config);
-
-    /* load data to buffer */
-    int LoadBuf(FILE* file, bool isSorted, int step);
-
-    /* clear data buffer */
-    void ClearBuf();
-
-    /* set the random batch flag */
-    void SetRandomBatch(bool flag = true);
-
-    /* load a batch of sequences */
-    int LoadBatch(FILE* file, bool isLM,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs,
-        int vsEnc, int vsDec, int sBatch, int wBatch,
-        bool isSorted, int& ws, int& wCount,
-        int devID, bool isTraining);
-
-    /* load a batch of sequences (for language modeling) */
-    int LoadBatchLM(FILE* file,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs, int vs, int sBatch, int wBatch,
-        bool isSorted, int& wCount,
-        int devID, bool isTraining);
-
-    /* load a batch of sequences (for machine translation) */
-    int LoadBatchMT(FILE* file,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
-        bool isSorted, int& ws, int& wCount,
-        int devID, bool isTraining);
-
-    /* shuffle the data file */
-    void Shuffle(const char* srcFile, const char* tgtFile);
-};
-
-}
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/train/TrainDataSet.cpp
+++ b/source/sample/transformer/train/TrainDataSet.cpp
--- a/source/sample/transformer/train/TrainDataSet.h
+++ b/source/sample/transformer/train/TrainDataSet.h
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
+ */
+
+#ifndef __TRAIN_DATASET_H__
+#define __TRAIN_DATASET_H__
+
+#include <cstdio>
+#include <vector>
+#include <fstream>
+
+#include "../../../tensor/XList.h"
+#include "../../../tensor/XTensor.h"
+#include "../../../tensor/XGlobal.h"
+
+#define MAX_WORD_NUM 120
+
+using namespace std;
+
+namespace nts {
+
+/* a sentence pair used for training: an id, the tokenized source and target
+ * sentences, and two integer keys used for shuffling.
+ * NOTE(review): the int members have no default initializer here, so the code
+ * that creates an instance must assign them before reading them. */
+struct TrainExample {
+
+    /* id of the sentence pair */
+    int id;
+
+    /* source language sentence (tokenized) */
+    IntList srcSent;
+
+    /* target language sentence (tokenized) */
+    IntList tgtSent;
+
+    /* the key used to shuffle items in a bucket */
+    int key;
+
+    /* the key used to shuffle buckets */
+    int bucketKey;
+};
+
+/* A `TrainDataSet` is associated with a file which contains training data.
+ * No constructor is declared for this struct, so every scalar member carries
+ * a default member initializer: an instance is in a well-defined state even
+ * before Init() has been called on it. */
+struct TrainDataSet {
+public:
+    /* the data buffer */
+    TrainBufferType buffer;
+
+    /* a list of empty line numbers */
+    IntList emptyLines;
+
+    /* the pointer to the file stream (null by default) */
+    FILE* fp{ nullptr };
+
+    /* current index in the buffer */
+    size_t curIdx{ 0 };
+
+    /* size of used data in the buffer */
+    size_t bufferUsed{ 0 };
+
+    /* size of the bucket used for grouping sentences */
+    size_t bucketSize{ 0 };
+
+    /* indicates whether it is used for training */
+    bool isTraining{ false };
+
+public:
+
+    /* sort the input by length (in descending order) */
+    void SortByLength();
+
+    /* sort buckets by key (in descending order) */
+    void SortBucket();
+
+    /* sort the output by key (in descending order)
+       >> begin, end - presumably the range of buffer items to sort;
+          TODO confirm whether `end` is inclusive */
+    void SortInBucket(int begin, int end);
+
+    /* load data from a file to the buffer */
+    void LoadDataToBuffer();
+
+    /* generate a mini-batch
+       >> batchEnc, paddingEnc - output tensors for the encoder side
+       >> batchDec, paddingDec - output tensors for the decoder side
+       >> label - output tensor for the labels
+       >> minSentBatch, batchSize - batch size constraints; presumably a
+          minimum sentence count and a batch size limit - TODO confirm units
+       >> devID - device id
+       << return - a UInt64List; its contents are defined by the
+          implementation, which is not visible in this header */
+    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
+                         XTensor* batchDec, XTensor* paddingDec, XTensor* label,
+                         size_t minSentBatch, size_t batchSize, int devID);
+
+    /* initialization function
+       NOTE: the parameter `bucketSize` has the same name as the member
+       `bucketSize`, so inside the implementation the member has to be
+       referred to as `this->bucketSize` */
+    void Init(const char* dataFile, int bucketSize, bool training);
+
+    /* check if the buffer is empty */
+    bool IsEmpty();
+
+    /* reset the buffer */
+    void ClearBuf();
+
+    /* group data into buckets with similar length */
+    void BuildBucket();
+
+    /* de-constructor */
+    ~TrainDataSet();
+};
+}
+
+#endif // __TRAIN_DATASET_H__
\ No newline at end of file
--- a/source/sample/transformer/train/T2TTrainer.cpp
+++ b/source/sample/transformer/train/T2TTrainer.cpp
--- a/source/sample/transformer/train/T2TTrainer.h
+++ b/source/sample/transformer/train/T2TTrainer.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,25 +18,24 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */

-#ifndef __T2TTRAINER_H__
-#define __T2TTRAINER_H__
+#ifndef __TRAINER_H__
+#define __TRAINER_H__

-#include "../T2TModel.h"
-#include "T2TBatchLoader.h"
-#include "../../../tensor/function/FHeader.h"
+#include "../Model.h"
+#include "TrainDataSet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

-/* trainer of the T2T model */
-class T2TTrainer
+/* trainer of the model */
+class Trainer
 {
 public:

    /* configurations */
-    T2TConfig* cfg;
+    Config* cfg;

    /* dimension size of each inner layer */
    int d;
@@ -63,12 +61,18 @@ public:
    /* word batch size */
    int wBatchSize;

+    /* size of bucket for grouping data by length */
+    int bucketSize;
+
    /* training epoch number */
    int nepoch;

    /* traing step number */
    int nstep;

+    /* the maximum number of saved checkpoints */
+    int maxCheckpoint;
+
    /* indicates whether we use adam */
    bool useAdam;

@@ -100,39 +104,36 @@ public:
    /* number of batches on which we do model update */
    int updateStep;

-    /* indicates whether we intend to debug the net */
-    bool isDebugged;
-
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

-    /* for batching */
-    T2TBatchLoader batchLoader;
+    /* used for loading batches */
+    TrainDataSet batchLoader;

 public:
    /* constructor */
-    T2TTrainer();
+    Trainer();

    /* de-constructor */
-    ~T2TTrainer();
+    ~Trainer();

    /* initialize the trainer */
-    void Init(T2TConfig& config);
+    void Init(Config& config);

    /* train the model */
-    void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
+    void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);

    /* test the model */
-    void Validate(const char* fn, const char* ofn, T2TModel* model);
+    void Validate(const char* fn, const char* ofn, Model* model);

    /* make a checkpoint */
-    void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
+    void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);

    /* update the model by delta rule */
-    void Update(T2TModel* model, const float lr);
+    void Update(Model* model, const float lr);

    /* prepare model for training */
-    void PrepareModel(T2TModel* model);
+    void PrepareModel(Model* model);
 };

 }

--- a/source/sample/transformer/translate/T2TDataSet.cpp
+++ b/source/sample/transformer/translate/T2TDataSet.cpp
--- a/source/sample/transformer/translate/T2TDataSet.h
+++ b/source/sample/transformer/translate/T2TDataSet.h
--- a/source/sample/transformer/translate/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/translate/T2TLengthPenalty.cpp
--- a/source/sample/transformer/translate/T2TLengthPenalty.h
+++ b/source/sample/transformer/translate/T2TLengthPenalty.h
--- a/source/sample/transformer/translate/T2TPredictor.cpp
+++ b/source/sample/transformer/translate/T2TPredictor.cpp
--- a/source/sample/transformer/translate/T2TPredictor.h
+++ b/source/sample/transformer/translate/T2TPredictor.h
--- a/source/sample/transformer/translate/T2TSearch.cpp
+++ b/source/sample/transformer/translate/T2TSearch.cpp
--- a/source/sample/transformer/translate/T2TSearch.h
+++ b/source/sample/transformer/translate/T2TSearch.h
--- a/source/sample/transformer/translate/T2TTranslator.cpp
+++ b/source/sample/transformer/translate/T2TTranslator.cpp
--- a/source/sample/transformer/translate/T2TTranslator.h
+++ b/source/sample/transformer/translate/T2TTranslator.h
--- a/source/sample/transformer/translate/T2TVocab.cpp
+++ b/source/sample/transformer/translate/T2TVocab.cpp
--- a/source/sample/transformer/translate/T2TVocab.h
+++ b/source/sample/transformer/translate/T2TVocab.h
--- a/source/tensor/XGlobal.h
+++ b/source/tensor/XGlobal.h
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
--- a/source/tensor/core/arithmetic/Sub.cu
+++ b/source/tensor/core/arithmetic/Sub.cu
--- a/source/tensor/core/arithmetic/Sub.cuh
+++ b/source/tensor/core/arithmetic/Sub.cuh
--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
--- a/source/tensor/core/arithmetic/SubDim.cuh
+++ b/source/tensor/core/arithmetic/SubDim.cuh
--- a/source/tensor/core/arithmetic/SubDim.h
+++ b/source/tensor/core/arithmetic/SubDim.h
--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
--- a/source/tensor/core/utilities/Float16.cpp
+++ b/source/tensor/core/utilities/Float16.cpp
--- a/source/tensor/core/utilities/Float16.h
+++ b/source/tensor/core/utilities/Float16.h
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
--- a/source/tensor/test/TSubDim.h
+++ b/source/tensor/test/TSubDim.h
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/tools/pack_model.py
+++ b/tools/pack_model.py