Commit b9c318bd by hello

no message

parent 1d17c439
@@ -4,3 +4,4 @@ x64/
vc140.pdb
NiuTrans.Tensor.vcxproj.user
NiuTrans.Tensor.aps
*.tgz
@@ -97,35 +97,47 @@ if(USE_CUDA)
        add_definitions(-DHALF_PRECISION)
    endif()
    find_package(CUDA REQUIRED)
    if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
        set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
    elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
        set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
    elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
        set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
    elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
        set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
    elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
        set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
    elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
        set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
    endif()
    if(USE_HALF_PRECISION)
        if(NOT DEFINED GPU_ARCH)
            set(ARCH_FLAGS -arch=sm_60
                -gencode=arch=compute_60,code=sm_60
                -gencode=arch=compute_61,code=sm_61
                -gencode=arch=compute_62,code=sm_62
                -gencode=arch=compute_70,code=sm_70
                -gencode=arch=compute_72,code=sm_72
                -gencode=arch=compute_70,code=compute_70
            )
        elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
            message(FATAL_ERROR "Half precision is not supported on Kepler/Maxwell GPUs")
        endif()
    endif()
    if(WIN32)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
        link_directories("${CUDA_ROOT}/lib/x64")
        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
@@ -133,31 +145,14 @@ if(USE_CUDA)
    else()
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        link_directories("${CUDA_ROOT}/lib64")
        include_directories("${CUDA_ROOT}/include")
        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
......
@@ -46,7 +46,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the Visual Studio project (if your Visual Studio version is older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, add the extra CMake argument `-DGEN_DLL=ON`; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (check your device against the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus); a compile-time probe of this switch is sketched after this list).
- On success, CMake prints `Build files have been written to: ...`
- Open the NiuTensor.sln file in the build directory to load the NiuTensor project in Visual Studio.
- In the Solution Explorer, right-click NiuTensor and set it as the startup project to get started.
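As a minimal illustration of what the half-precision switch changes at compile time, here is a sketch (not part of the commit; it assumes only the `HALF_PRECISION` macro that the CMakeLists.txt excerpt above defines via `add_definitions(-DHALF_PRECISION)`):

```cpp
#include <cstdio>

int main() {
    // HALF_PRECISION is defined by the build system when the project is
    // configured with -DUSE_HALF_PRECISION=ON (Pascal or newer GPUs only).
#ifdef HALF_PRECISION
    std::printf("built with half-precision support\n");
#else
    std::printf("built with single-precision support only\n");
#endif
    return 0;
}
```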
@@ -67,7 +67,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and fill in "CMake options"; CLion then configures and builds the project with CMake automatically. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument in "CMake options"; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), enter `-DUSE_MKL=ON` in "CMake options" and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), enter `-DUSE_OPENBLAS=ON` in "CMake options" and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), enter `-DUSE_CUDA=ON` in "CMake options", point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
##### CMake (command line)
@@ -78,7 +78,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the project. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
@@ -137,4 +137,4 @@ The NiuTensor open-source tensor library is developed by the Natural Language Processing Lab at Northeastern University…
## Version history
NiuTensor version 0.3.5 - February 6, 2021
# NiuTrans.Tensor Environment Setup
## Notes
The latest CUDA release (9.2) does not yet support the newest Visual Studio 2017, so CUDA 9.0 or 9.1 is recommended; use VS2015, or install the v140 toolset when using VS2017, and set the solution platform to x64.
## CUDA configuration
With VS and CUDA installed and the environment variables configured, the key CUDA options are listed below; all of them can be found under **Project -> Properties**.
>$(CUDA_PATH)\include
Add to **VC++ Directories -> Include Directories**.
>$(CUDA_PATH)\lib\Win32
Add to **VC++ Directories -> Library Directories**.
>cuda.lib;cudadevrt.lib;cudart.lib;cudart_static.lib;nvcuvid.lib;OpenCL.lib;cublas.lib;curand.lib;
Add to **Linker -> Input -> Additional Dependencies**.
When configuration is complete, right-click **Project -> Project Dependencies** and select CUDA 9.
Right-click a .cu file, open its properties, and set the Item Type to "CUDA C/C++" (it is easiest to search for all .cu files, select them all, and set this in one pass).
## Other settings
**C/C++ -> General -> SDL checks**: set to No.
Under **C/C++ -> Preprocessor -> Preprocessor Definitions**, add
>USE_CUDA;USE_BLAS;WIN32;MKL;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;
**Linker -> System -> SubSystem**: set to Console.
**General -> Character Set**: use the Unicode character set.
Set the command-line arguments the executable needs under **Debugging -> Command Arguments** (the sketch below mirrors the arguments the sample entry point accepts).
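As a reference for those command arguments, the sample entry point later in this commit (Main.cpp) dispatches on the first argument. The following self-contained sketch mirrors that dispatch; it assumes only the `-fnnlm` and `-t2t` switches visible in the diff, and prints messages instead of calling the real `FNNLMMain`/`NMTMain`:

```cpp
#include <cstdio>
#include <cstring>

int main(int argc, const char** argv) {
    // "-fnnlm" selects the FNN language model sample, "-t2t" the NMT sample.
    if (argc > 1 && !std::strcmp(argv[1], "-fnnlm"))
        std::printf("would call FNNLMMain(argc - 1, argv + 1)\n");
    else if (argc > 1 && !std::strcmp(argv[1], "-t2t"))
        std::printf("would call NMTMain(argc - 1, argv + 1)\n");
    else
        std::printf("no sample selected; the real program prints a usage hint\n");
    return 0;
}
```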
@@ -39,7 +39,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the Visual Studio project (if your Visual Studio version is older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, add the extra CMake argument `-DGEN_DLL=ON`; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...`
- Open the NiuTensor.sln file in the build directory to load the NiuTensor project in Visual Studio.
- In the Solution Explorer, right-click NiuTensor and set it as the startup project to get started.
@@ -60,7 +60,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and fill in "CMake options"; CLion then configures and builds the project with CMake automatically. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument in "CMake options"; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), enter `-DUSE_MKL=ON` in "CMake options" and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), enter `-DUSE_OPENBLAS=ON` in "CMake options" and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), enter `-DUSE_CUDA=ON` in "CMake options", point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
##### CMake (command line)
@@ -71,7 +71,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run the CMake command to generate the project. To build a dynamic link library, add the extra `-DGEN_DLL=ON` CMake argument; otherwise an executable is built by default.
- To enable the MKL math library (installed separately by the user), add the `-DUSE_MKL=ON` argument to the CMake command and point `-DINTEL_ROOT='/intel/root/path'` at the MKL (Intel toolkit) installation path, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), add the `-DUSE_OPENBLAS=ON` argument and point `-DOPENBLAS_ROOT='/openblas/root/path'` at the OpenBLAS installation path, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), add the `-DUSE_CUDA=ON` argument, point `-DCUDA_ROOT='/cuda/root/path'` at the CUDA installation path, and select the architecture of the target GPU with `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To compute with half-precision floats on the GPU, also enable `-DUSE_HALF_PRECISION=ON`; half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer (see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus)).
- On success, CMake prints `Build files have been written to: ...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
......
@@ -26,7 +26,7 @@
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"

//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
@@ -34,7 +34,7 @@
using namespace nts;
using namespace fnnlm;
using namespace nmt;

int main( int argc, const char ** argv )
{
@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        NMTMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{
/* constructor */
@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(Config& config)
{
    devID = config.devID;
    nlayer = config.nDecLayer;
@@ -80,16 +77,17 @@ void AttDecoder::InitModel(Config& config)
    /* embedding model */
    embedder.InitModel(config, false);

    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    selfAttLayerNorms = new LN[nlayer];
    enDeAtt = new Attention[nlayer];
    enDeAttLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];
    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];

    if (preNorm)
        decoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -99,6 +97,8 @@ void AttDecoder::InitModel(Config& config)
        fnnLayerNorms[i].InitModel(config);
        enDeAtt[i].InitModel(config);
        enDeAttLayerNorms[i].InitModel(config);
        selfAttCache[i].enable = true;
        enDeAttCache[i].enable = true;
    }

    if (preNorm)
        decoderLayerNorm->InitModel(config);
@@ -118,6 +118,7 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                         XTensor* maskEncDec, int nstep, bool isTraining)
{
    XTensor x;

    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
    }

    if (preNorm)
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm)
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which positions are valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for encoder-decoder attention */
x = enDeAttLayerNorms[i].Make(x);
/* encoder-decoder attention */
x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
    x = decoderLayerNorm->Make(x);

    return x;
}

}
\ No newline at end of file
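The new `MakeFast` above hard-codes the pre-norm layer layout, while `Make` supports both orders through the `LayerNorm` wrapper. The difference is sketched schematically below with hypothetical `ln`, `sublayer`, and `dropout` callables standing in for the real modules; only the ordering is taken from the code above:

```cpp
#include <functional>

// T is any tensor-like type with operator+ (the residual connection).
template <typename T>
T PreNormStep(T x, std::function<T(T)> sublayer,
              std::function<T(T)> ln, std::function<T(T)> dropout) {
    // Pre-norm (MakeFast): normalize first, run the sublayer,
    // and let the residual skip around both.
    return x + dropout(sublayer(ln(x)));
}

template <typename T>
T PostNormStep(T x, std::function<T(T)> sublayer,
               std::function<T(T)> ln, std::function<T(T)> dropout) {
    // Post-norm: the residual sum itself is normalized.
    return ln(x + dropout(sublayer(x)));
}
```

With pre-norm, a single final layer normalization is applied after the last layer, which is exactly what the tail of `MakeFast` does with `decoderLayerNorm`.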
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#ifndef __DECODER_H__
#define __DECODER_H__

#include "Encoder.h"
#include "Utility.h"

namespace nmt
{

class AttDecoder
@@ -52,28 +51,28 @@ public:
    DTYPE dropoutP;

    /* embedding of word at each position */
    Embedder embedder;

    /* FNN model of each layer */
    FNN* fnns;

    /* attention model of each layer */
    Attention* selfAtt;

    /* layer normalization for attention */
    LN* selfAttLayerNorms;

    /* layer normalization for fnn */
    LN* fnnLayerNorms;

    /* layer normalization for decoder */
    LN* decoderLayerNorm;

    /* encoder-decoder attention model of each layer */
    Attention* enDeAtt;

    /* layer normalization for encoder-decoder attention */
    LN* enDeAttLayerNorms;

    /* layer cache list */
    Cache* selfAttCache;
@@ -92,11 +91,15 @@ public:
    ~AttDecoder();

    /* initialize the model */
    void InitModel(Config& config);

    /* make the decoding network */
    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                 XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};

}
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{

/* constructor */
@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(Config& config)
{
    devID = config.devID;
@@ -68,18 +65,18 @@ void AttEncoder::InitModel(Config& config)
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(config);

    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    attLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];

    if (preNorm)
        encoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);

        /* self attention */
        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
@@ -151,6 +148,62 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
    }

    if (preNorm)
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = attLayerNorms[i].Make(x);
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
    x = encoderLayerNorm->Make(x);

    return x;
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,25 +19,25 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

#ifndef __ENCODER_H__
#define __ENCODER_H__

#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"

using namespace nts;

namespace nmt
{

/*
base class of the encoder
*/
class Encoder
{
public:
    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
@@ -47,7 +46,7 @@ public:

/*
the encoder based on self-attention
*/
class AttEncoder : Encoder
{
public:
    /* device id */
@@ -73,22 +72,22 @@ public:
    int ignored;

    /* embedding of word at each position */
    Embedder embedder;

    /* FNN model of each layer */
    FNN* fnns;

    /* attention model of each layer */
    Attention* selfAtt;

    /* layer normalizations for attention */
    LN* attLayerNorms;

    /* layer normalization for fnn */
    LN* fnnLayerNorms;

    /* layer normalization for encoder */
    LN* encoderLayerNorm;

    /* the location of layer normalization */
    bool preNorm;
@@ -101,11 +100,14 @@ public:
    ~AttEncoder();

    /* initialize the model */
    void InitModel(Config& config);

    /* make the encoding network */
    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
    /* make the encoding network (wrapper) */
    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
......
/* NiuTrans.NMT - an open-source neural machine translation system.
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,32 +21,32 @@
#include <cstdint>

#include "Model.h"
#include "Utility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"

namespace nmt
{
/* constructor */
Model::Model()
{
    devID = -1;
    isLM = false;
    isMT = false;
    useFP16 = false;
    shareAllEmbeddings = 0;
    shareDecInputOutputWeight = 0;
    nhead = 1;

    encoder = new AttEncoder();
    decoder = new AttDecoder();
    outputLayer = new Output();
}

/* de-constructor */
Model::~Model()
{
    delete encoder;
    delete decoder;
@@ -58,7 +57,7 @@ Model::~Model()
initialize the model
>> config - configurations of the model
*/
void Model::InitModel(Config& config)
{
    devID = config.devID;
    isMT = config.isMT;
@@ -71,8 +70,8 @@ void Model::InitModel(Config& config)
        &config.fnnHiddenSize, &config.modelSize,
        &config.embSize, &config.srcVocabSize,
        &config.tgtVocabSize, &config.nhead,
        &config.maxRP, &config.shareAllEmbeddings,
        &config.shareDecInputOutputWeight,
        &config.maxPosLen
    };
@@ -81,10 +80,28 @@ void Model::InitModel(Config& config)
    /* read model configurations */
    if (!config.isTraining) {
        modelFile = fopen(config.modelFN, "rb");
        CheckNTErrors(modelFile, "Failed to open the model file");
        for (auto& meta : metaInfo) {
            fread(meta, sizeof(int), 1, modelFile);
        }
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
    nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
    encoder->InitModel(config);
    outputLayer->InitModel(config);
@@ -92,13 +109,12 @@ void Model::InitModel(Config& config)
    if (isMT)
        decoder->InitModel(config);

    /* load parameters */
    if (!config.isTraining)
        Read(modelFile);
    else {
        TensorList params;
        GetParams(params);
        for (int i = 0; i < params.Size(); i++)
            params[i]->SetVarFlag();
    }
@@ -108,13 +124,28 @@
}

/*
print model configurations
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
/* TODO: output more info */
XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
    XTensor nothing;
@@ -123,15 +154,14 @@ XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)

/*
make the decoding network
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
                           XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
@@ -145,7 +175,7 @@ make the network for language modeling (with the output softmax layer)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
    int len = padding.GetDim(padding.order - 1);
    int* dims = new int[padding.order + 2];
@@ -173,19 +203,19 @@ void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)

/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
                   XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
    XTensor encoding;
    XTensor decoding;
    XTensor maskEnc;
    XTensor maskDec;
    XTensor maskEncDec;
@@ -213,7 +243,7 @@ make the mask for training MT models
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
                       XTensor& paddingEnc, XTensor& paddingDec,
                       XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
@@ -260,8 +290,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;

    XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);

    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -284,37 +313,27 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,

/*
make the mask of the encoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
    XTensor padding2;

    /* mask of the padding */
    Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
    Unsqueeze(padding2, maskEnc, 0, nhead);
    ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
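The rewritten `MakeMTMaskEnc` now writes the mask directly: `Unsqueeze` broadcasts the padding map to the attention shape, and `ScaleAndShiftMe(maskEnc, 1e9F, -1e9F)` turns padding value 1 (a real token) into 0 and padding value 0 (a pad) into -1e9, so the softmax assigns padded positions a near-zero weight. A scalar sketch of that arithmetic (illustrative only, not part of the commit):

```cpp
#include <cstdio>

// Scalar view of ScaleAndShiftMe(mask, 1e9F, -1e9F): y = 1e9 * x - 1e9.
static float MaskValue(float padding) {
    return 1e9f * padding - 1e9f;
}

int main() {
    std::printf("real token: %g\n", MaskValue(1.0f)); // 0: logit unchanged
    std::printf("padding:    %g\n", MaskValue(0.0f)); // -1e9: logit pushed toward -inf
    return 0;
}
```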
/*
make the mask of the decoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maskDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
                          XTensor& maskDec, XTensor& maskEncDec)
{
    int len = paddingDec.GetDim(paddingDec.order - 1);
@@ -340,26 +359,27 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
    delete[] dims;
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void Model::GetParams(TensorList& list)
{
    list.Clear();

    /* encoder parameters */
    for (int i = 0; i < encoder->nlayer; i++) {
        list.Add(&encoder->selfAtt[i].weightQ);
        list.Add(&encoder->selfAtt[i].weightK);
        list.Add(&encoder->selfAtt[i].weightV);
        list.Add(&encoder->selfAtt[i].biasQ);
        list.Add(&encoder->selfAtt[i].biasK);
        list.Add(&encoder->selfAtt[i].biasV);
        if (encoder->selfAtt[i].useRPR)
            list.Add(&encoder->selfAtt[i].RPEmbK);
        list.Add(&encoder->selfAtt[i].weightO);
        list.Add(&encoder->selfAtt[i].biasO);
        list.Add(&encoder->fnns[i].w1);
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
@@ -377,26 +397,26 @@ void Model::GetParams(TensorList& list)
    if (isMT) {
        /* decoder parameters */
        for (int i = 0; i < decoder->nlayer; i++) {
            list.Add(&decoder->selfAtt[i].weightQ);
            list.Add(&decoder->selfAtt[i].weightK);
            list.Add(&decoder->selfAtt[i].weightV);
            list.Add(&decoder->selfAtt[i].biasQ);
            list.Add(&decoder->selfAtt[i].biasK);
            list.Add(&decoder->selfAtt[i].biasV);
            if (decoder->selfAtt[i].useRPR)
                list.Add(&decoder->selfAtt[i].RPEmbK);
            list.Add(&decoder->selfAtt[i].weightO);
            list.Add(&decoder->selfAtt[i].biasO);
            list.Add(&decoder->selfAttLayerNorms[i].w);
            list.Add(&decoder->selfAttLayerNorms[i].b);
            list.Add(&decoder->enDeAtt[i].weightQ);
            list.Add(&decoder->enDeAtt[i].weightK);
            list.Add(&decoder->enDeAtt[i].weightV);
            list.Add(&decoder->enDeAtt[i].biasQ);
            list.Add(&decoder->enDeAtt[i].biasK);
            list.Add(&decoder->enDeAtt[i].biasV);
            list.Add(&decoder->enDeAtt[i].weightO);
            list.Add(&decoder->enDeAtt[i].biasO);
            list.Add(&decoder->enDeAttLayerNorms[i].w);
            list.Add(&decoder->enDeAttLayerNorms[i].b);
            list.Add(&decoder->fnns[i].w1);
@@ -418,8 +438,9 @@ void Model::GetParams(TensorList& list)
        list.Add(&decoder->embedder.w);
    }

    if (shareDecInputOutputWeight == 0) {
        list.Add(&outputLayer->w);
    }
}
/*
@@ -427,14 +448,14 @@ dump the model to a file
>> fn - where to save the model
>> model - the model
*/
void Model::Dump(const char* fn)
{
    double startT = GetClockSec();

    FILE* file = fopen(fn, "wb");
    CheckNTErrors(file, "Cannot open the model file");

    TensorList params;

    GetParams(params);
@@ -459,22 +480,29 @@ void Model::Dump(const char* fn)
    double elapsed = GetClockSec() - startT;
    LOG("model saved (took %.1fs)", elapsed);
}

/* read the parameters */
void Model::Read(FILE* file)
{
    double startT = GetClockSec();

    TensorList params;
    GetParams(params);
LOG("params count: %lu", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 */ /* convert parameters to FP16 before reading files */
if (useFP16) { if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) { for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i]; XTensor* p = params[i];
InitTensorV2(p, p->order, p->dimSize, X_FLOAT16, 1, p->devID); InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
} }
auto& encEmb = encoder->embedder.posEmbeddingBase; auto& encEmb = encoder->embedder.posEmbeddingBase;
...@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file) ...@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file)
/* share all embeddings */ /* share all embeddings */
if (shareAllEmbeddings == 1) { if (shareAllEmbeddings == 1) {
decoder->embedder.w = CopyValues(encoder->embedder.w); _CopyValues(&encoder->embedder.w, &decoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing encoder decoder embeddings\n"); LOG("sharing encoder decoder embeddings");
} }
/* share embeddings with output weights */ /* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) { if (shareDecInputOutputWeight == 1) {
outputLayer->w = CopyValues(decoder->embedder.w); _CopyValues(&decoder->embedder.w, &outputLayer->w);
XPRINT(0, stderr, "[INFO] sharing decoder embeddings with output weights\n"); LOG("sharing decoder embeddings with output weights");
} }
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed); LOG("model loaded (took %.1fs)", elapsed);
} }
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,23 +19,22 @@ ...@@ -20,23 +19,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TMODEL_H__ #ifndef __MODEL_H__
#define __T2TMODEL_H__ #define __MODEL_H__
#include "T2TEncoder.h" #include "Encoder.h"
#include "T2TDecoder.h" #include "Decoder.h"
#include "module/T2TFNN.h" #include "module/FNN.h"
#include "module/T2TOutput.h" #include "module/Output.h"
#include "module/T2TUtility.h" #include "Utility.h"
#include "module/T2TAttention.h" #include "module/Attention.h"
namespace transformer namespace nmt
{ {
/* a transformer model that keeps parameters of the encoder, /* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates the decoder and the output layer (softmax). */
the network used in transformer. */ class Model
class T2TModel
{ {
public: public:
/* device id */ /* device id */
...@@ -49,7 +47,7 @@ public: ...@@ -49,7 +47,7 @@ public:
AttDecoder* decoder; AttDecoder* decoder;
/* output layer */ /* output layer */
T2TOutput* outputLayer; Output* outputLayer;
/* indicates whether the model is running for language modeling */ /* indicates whether the model is running for language modeling */
bool isLM; bool isLM;
...@@ -71,13 +69,16 @@ public: ...@@ -71,13 +69,16 @@ public:
public: public:
/* constructor */ /* constructor */
T2TModel(); Model();
/* de-constructor */ /* de-constructor */
~T2TModel(); ~Model();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */ /* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining); XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -17,49 +16,47 @@ ...@@ -17,49 +16,47 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/ */
#include <cmath>
#include <ctime> #include <ctime>
#include "Transformer.h" #include "NMT.h"
#include "train/T2TTrainer.h" #include "train/Trainer.h"
#include "module/T2TUtility.h" #include "translate/Translator.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
namespace transformer namespace nmt
{ {
int TransformerMain(int argc, const char** argv) int NMTMain(int argc, const char** argv)
{ {
if (argc == 0) if (argc == 0)
return 1; return 1;
/* load configurations */ /* load configurations */
T2TConfig config(argc, argv); Config config(argc, argv);
srand((unsigned int)time(NULL)); srand(1);
/* train the model */ /* training */
if (strcmp(config.trainFN, "") != 0) { if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model; Model model;
model.InitModel(config); model.InitModel(config);
T2TTrainer trainer; Trainer trainer;
trainer.Init(config); trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model); trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
} }
/* translate the test file */ /* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) { if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD; DISABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config); model.InitModel(config);
T2TTranslator translator; Translator translator;
translator.Init(config); translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN, translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model); config.tgtVocabFN, config.outputFN, &model);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,29 +15,17 @@ ...@@ -16,29 +15,17 @@
*/ */
/* /*
* * An implementation of the NMT system.
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
*/ */
#ifndef __TRANSFORMER_H__ #ifndef __NMT_H__
#define __TRANSFORMER_H__ #define __NMT_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* entrance of the program */ /* entrance of the program */
int TransformerMain(int argc, const char** argv); int NMTMain(int argc, const char** argv);
} }
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TAttention.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TAttention::T2TAttention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* deconstructor */
T2TAttention::~T2TAttention()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
*/
void T2TAttention::InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID)
{
devID = myDevID;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 4);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "maxPosition", &max_relative_position, 8);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
InitTensor1DV2(&bq, d, X_FLOAT, devID);
InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
InitTensor1DV2(&bk, d, X_FLOAT, devID);
InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
InitTensor1DV2(&bv, d, X_FLOAT, devID);
InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
InitTensor1DV2(&bo, d, X_FLOAT, devID);
}
/*
make the network
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
>> cache - layer cache list
>> cacheType - which type that cache is
<< return - multi-head attention result
*/
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (cache == NULL);
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MatrixMul(q, wq) + bq;
if (!cache) {
/* self attention for encoder layers */
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
}
else {
if (cacheType == SELF_ATT) {
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
/* if the cache hits, we only concatenate it with the keys/values of the new token */
if (!cache->miss) {
k2 = Concatenate(cache->key, k2, 1);
v2 = Concatenate(cache->value, v2, 1);
}
cache->key = k2;
cache->value = v2;
cache->miss = false;
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MatrixMul(k, wk) + bk;
cache->value = MatrixMul(v, wv) + bv;
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
CheckNTErrors(0, "invalid cache type");
}
}
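/* Illustrative note (not part of the original code): across decoding steps
   the cache behaves as follows. At step 0, cache->miss is true, so the keys
   and values are computed from scratch and stored. At step t > 0, only the
   new token's keys/values are computed and concatenated to cache->key and
   cache->value along dimension 1, so the cached keys grow as (B, 1, H),
   (B, 2, H), and so on. For encoder-decoder attention the keys/values are
   computed once on the first miss and re-used unchanged afterwards. */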
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
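/* A self-contained sketch of the scaled dot-product attention computed
   above, for a single head on plain row-major arrays (illustrative only;
   the function name and the layout q[Lq*D], k[Lkv*D], v[Lkv*D] are
   assumptions, not part of the original API; <math.h> is already included
   at the top of this file). */
static void ScaledDotAttentionSketch(const float* q, const float* k, const float* v,
                                     float* out, int Lq, int Lkv, int D)
{
    float* scores = new float[Lkv];
    for (int i = 0; i < Lq; i++) {
        /* scores_j = (q_i . k_j) / sqrt(D) */
        float maxS = -1e30F;
        for (int j = 0; j < Lkv; j++) {
            float s = 0;
            for (int t = 0; t < D; t++)
                s += q[i * D + t] * k[j * D + t];
            scores[j] = s / (float)sqrt((float)D);
            if (scores[j] > maxS)
                maxS = scores[j];
        }
        /* softmax over the key dimension (max-subtracted for stability) */
        float sum = 0;
        for (int j = 0; j < Lkv; j++) {
            scores[j] = (float)exp(scores[j] - maxS);
            sum += scores[j];
        }
        /* out_i = sum_j softmax(scores)_j * v_j */
        for (int t = 0; t < D; t++) {
            float o = 0;
            for (int j = 0; j < Lkv; j++)
                o += (scores[j] / sum) * v[j * D + t];
            out[i * D + t] = o;
        }
    }
    delete[] scores;
}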
/*
make the attention network by incorporating the relative position representation with the given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - the attention mask
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batch_size = q.GetDim(0);
const int len_q = q.GetDim(1);
const int len_kv = k.GetDim(1);
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
XTensor emb_matrix, relative_key;
InitTensor2DV2(&emb_matrix, len_q, len_kv, X_INT, q.devID);
InitTensor3DV2(&relative_key, len_q, len_kv, kheads.GetDim(-1), X_FLOAT, q.devID);
InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);
/* generate the relative emb index (L_q, L_kv) */
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);
/* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
_Gather(&rp_embedding_k, &relative_key, &emb_matrix);
/* RPR dot product (K, B, L_q, L_kv)*/
RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
/* scale the dot result */
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
/* softmax */
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
{
XTensor range;
InitTensor1DV2(&range, len_kv, X_INT, devID);
int* index = new int[len_kv];
// for encoder self-attention, where L_q = L_kv
if (is_encoder)
{
for (int i = 0; i < len_kv; i++)
index[i] = i;
range.SetData(index, len_kv);
XTensor range_2D, range_2D_t;
InitTensor2DV2(&range_2D, len_q, len_kv, X_INT, devID);
InitTensor2DV2(&range_2D_t, len_q, len_kv, X_INT, devID);
_Unsqueeze(&range, &range_2D, 0, len_q);
_Transpose(&range_2D, &range_2D_t, 0, 1);
_Sum(&range_2D, &range_2D_t, emb_matrix, -1);
}
// for decoder self-attention, where L_q != L_kv and L_q is 1
else
{
for (int i = 0; i < len_kv; i++)
index[i] = -len_kv + i + 1;
range.SetData(index, len_kv);
_Unsqueeze(&range, emb_matrix, 0, len_q);
}
// clip the tensor values to the range [-max_relative_length, max_relative_length]
_Clip(emb_matrix, emb_matrix, -max_relative_length, max_relative_length);
// (L_q, L_kv)
_ScaleAndShift(emb_matrix, emb_matrix, 1, max_relative_length);
delete[] index;
}
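/* Worked example of the index matrix built above (illustrative only).
   With len_q = len_kv = 4 and max_relative_position = 2, the raw relative
   distances j - i are
        0  1  2  3
       -1  0  1  2
       -2 -1  0  1
       -3 -2 -1  0
   and after clipping to [-2, 2] and shifting by +2 they become row indices
   into rp_embedding_k (which has 2 * 2 + 1 = 5 rows):
        2  3  4  4
        1  2  3  4
        0  1  2  3
        0  0  1  2 */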
void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key)
{
const int head_num = nhead;
const int batch_size = x->dimSize[1];
const int len_kv = y->dimSize[2];
const int len_q = x->dimSize[2];
const int depth = y->dimSize[3];
// L_kv (is_key=True) or H/K (is_key=False)
const int last_dim = is_key ? len_kv : depth;
MATRIX_TRANS_TYPE transpose_flag = is_key ? X_TRANS : X_NOTRANS;
//if (profiler_) profiler_->StartTimer("RPDotPro-BMM");
// for key: batch-MM: (K,B,L_q,H/K) * (K,B,H/K,L_kv) -> (K,B,L_q,L_kv)
// for not key: batch-MM: (K,B,L_q,L_kv) * (K,B,L_kv,H/K) -> (K,B,L_q,H/K)
XTensor context;
InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
// reshape and transpose x to (L_q, K*B, H/K or L_kv)
int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
x->Reshape(3, merge_dims);
XTensor x_t;
InitTensor3DV2(&x_t, len_q, head_num * batch_size, x->GetDim(-1), X_FLOAT, x->devID);
_Transpose(x, &x_t, 0, 1);
// for key: batch-MM: (L_q, K*B, H/K) * (L_q, L_kv, H/K) -> (L_q, K*B, L_kv)
// for not key: batch-MM: (L_q, K*B, L_kv) * (L_q, L_kv, H/K) -> (L_q, K*B, H/K)
XTensor relative;
InitTensor3DV2(&relative, len_q, head_num * batch_size, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(&x_t, X_NOTRANS, z, transpose_flag, &relative);
// (L_q, K*B, H/K or L_kv) -> (K*B, L_q, H/K or L_kv)
XTensor relative_t;
InitTensor3DV2(&relative_t, head_num * batch_size, len_q, last_dim, X_FLOAT, x->devID);
_Transpose(&relative, &relative_t, 0, 1);
int split_dims[] = { head_num, batch_size, len_q, last_dim };
relative_t.Reshape(4, split_dims);
_Sum(&context, &relative_t, attention);
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
/* cache for keys */
XTensor key;
/* cache for values */
XTensor value;
public:
bool miss;
Cache() {
miss = true;
}
void Update(XTensor&& k, XTensor&& v) {
key = k;
value = v;
miss = false;
}
};
/*
multi-head attention
y(Q, K, V) = cat(head_1, head_2, ..., head_n)
where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
d_k = dimension size of K
*/
class T2TAttention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor wq;
/* bias for Q */
XTensor bq;
/* transformation matrix for K */
XTensor wk;
/* bias for K */
XTensor bk;
/* transformation matrix for V */
XTensor wv;
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wo;
/* bias after dot-product attention */
XTensor bo;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether the attention is masked */
bool isMasked;
/* some positions can be ignored in attention. This is useful in language
modeling where the first position needs a special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */
DTYPE dropoutP;
/* max relative window size */
int max_relative_position;
public:
/* constructor */
T2TAttention();
/* de-constructor */
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1);
/* make the network */
XTensor Make( XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, Cache* cache, int cacheType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
/* make the attention network with relative position representation (RPR) given keys, queries and values (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder);
void GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int device_id, const bool is_encoder);
void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#include <math.h>
#include "T2TEmbedding.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* deconstructor */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 1024);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength, padIdx);
}
/*
make positional embeddings (of size length * eSize)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - maximum length of the sequence
>> padIdx - index of the padding token (its embedding is zeroed)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* padding zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
delete[] data;
}
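/* Note (illustrative): since exp(-i * log(10000) / (channelSize - 1)) equals
   10000^(-i / (channelSize - 1)), the table above is the familiar sinusoidal
   encoding
       PE(pos, i)               = sin(pos / 10000^(i / (channelSize - 1)))
       PE(pos, i + channelSize) = cos(pos / 10000^(i / (channelSize - 1)))
   with the sin and cos halves stored contiguously rather than interleaved,
   and with the row at padIdx zeroed out. */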
/*
make the network
>> input - word indices of the input sequence
>> prevLen - length of the previous sequence (for incremental decoding)
>> nstep - the current decoding step (decoder side only)
>> isDec - indicates whether this is the decoder-side embedder
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen, int nstep, bool isDec)
{
/* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensorV2(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec)
{
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
}
else
{
for (int i = 0; i < position.GetDim(0); i++) {
for (int j = 0; j < position.GetDim(1); j++) {
position.Set2DInt(nstep + 2, i, j);
}
}
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return Sum(wordEmbedding, posEmbedding);
}
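/* Worked example of the position indices built above (illustrative only).
   Assume the padding index is 1, prevLen = 0, and an encoder-side input row
   [5, 8, 9, 1, 1] (two padded slots). Real tokens are numbered from
   startNoPad = 2 and padded slots keep position 1:
       input    : 5 8 9 1 1
       position : 2 3 4 1 1
   On the decoder side, every token of the current step simply gets
   position nstep + 2. */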
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* predefined positional embeddings. Re-using this table speeds up
the embedding step. */
XTensor posEmbeddingBase;
public:
/* constructor */
T2TEmbedder();
/* de-constructor */
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length, int padIdx);
/* make the network */
XTensor Make(XTensor &input, int prevLen=0, int nstep = -1, bool isDec = false);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
T2TFNN::T2TFNN()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* deconstructor */
T2TFNN::~T2TFNN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
//float scale = 1.0F;
//float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
//float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
//
//w1.SetDataRand(-finfout1, finfout1);
//b1.SetZeroAll();
//w2.SetDataRand(-finfout2, finfout2);
//b2.SetZeroAll();
}
/*
make the network (pre-norm with a residual connection)
y = x + max(0, LN(x) * w1 + b1) * w2 + b2
>> input - the input tensor
>> isTraining - indicates whether the model is used for training
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, LN(x) * w1 + b1) */
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
XTensor res;
res = MulAndShift(t1, w2, b2);
_SumMe(&res, &input);
return res;
}
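/* Note (illustrative): this is the pre-norm variant - the normalization is
   applied to the block input and the residual is added after the second
   linear layer - rather than the post-norm form LN(x + FNN(x)) of the
   original Transformer paper. Pre-norm generally makes deep stacks easier
   to train. */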
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#include "T2TLayerNormal.h"
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
/* layer normalization for fnn */
T2TLN fnnLayerNorm;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
T2TFNN();
/* deconstructor */
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TLayerNormal.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
make the network
for each layer representation x, we have
y = (x - \mu) / \sigma * w + b
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor &input)
{
XTensor &x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
XTensor meanFilled;
XTensor standardFilled;
/* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1);
/* \sigma^2 = (sum_i (x_i - \mu)^2)/m (a small epsilon is added for numerical stability) */
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
}
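/* Worked example (illustrative only). For a row x = [1, 2, 3, 4]:
       mean     = 2.5
       variance = ((-1.5)^2 + (-0.5)^2 + 0.5^2 + 1.5^2) / 4 = 1.25
       standard = sqrt(1.25 + 1e-5) ~= 1.1180
       x'       ~= [-1.3416, -0.4472, 0.4472, 1.3416]
   and the output is x' * w + b, applied element-wise. */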
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
{
public:
/* device id */
int devID;
/* the scale vector w */
XTensor w;
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
T2TLN();
/* de-constructor */
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
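/* Worked example (illustrative only). For a hypothesis of length n = 7
   and alpha = 0.6:
       lp = ((5 + 7) / (5 + 1))^0.6 = 2^0.6 ~= 1.5157
   so the log-probability of the path is divided by about 1.52, softening
   the advantage that short sequences would otherwise enjoy. */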
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
 * Writing documents is harder than writing code :)
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* We intend to penalize short sequences because they obtain higher scores
(the score is a product of probability-like terms, so fewer factors yield
a larger product) and thus have more chances to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
XTensor GNMT(const XTensor & length, float alpha);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TOutput::T2TOutput()
{
devID = -1;
vSize = -1;
inSize = -1;
hSize = -1;
}
/* de-constructor */
T2TOutput::~T2TOutput()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
*/
void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#include "../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
#define OUTPUT_NAME "output"
/* output layer */
class T2TOutput
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* input vector size */
int inSize;
/* vector size of the linear transformation */
int hSize;
/* transformation matrix */
XTensor w;
public:
/* constructor */
T2TOutput();
/* de-constructor */
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
/* make the network (redefined output tensor) */
void Make(XTensor &input, XTensor &output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
#include <iostream>
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if (states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = 2;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
>> isStart - indicates whether it is the starting state
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor& output = next->prob;
XTensor decoding;
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, &maskDec, maskEncDec, false);
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* generate the output probabilities */
m->outputLayer->Make(decoding, output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
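/* A self-contained sketch of the back-pointer traversal above on a toy
   three-state chain (illustrative only; the helper below is an assumption,
   not part of the original API): */
static void BackTraceSketch()
{
    /* build the chain s0 <- s1 <- s2 with predictions 7, 3, 9 */
    T2TState s0, s1, s2;
    s0.prediction = 7; s0.last = NULL;
    s1.prediction = 3; s1.last = &s0;
    s2.prediction = 9; s2.last = &s1;

    /* walking the "last" pointers from s2 visits 9, 3, 7; writing each
       prediction at offset (distance - nsteps) restores the left-to-right
       order 7, 3, 9, exactly as GeneratePaths fills each row of "path" */
    const int distance = 3;
    int path[distance];
    int nsteps = 0;
    for (T2TState* cur = &s2; cur != NULL; cur = cur->last) {
        nsteps++;
        path[distance - nsteps] = cur->prediction;
    }
    (void)path;
}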
/*
get the predictions of the previous step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
lastPred.Set2DInt(cur->prediction, i, 0);
}
return lastPred;
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
 * This is the first source file I created in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in a batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we have gone over so far */
int nstep;
/* pointer to the previous state */
T2TState* last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* list of states */
T2TState* states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure as in MT inference -
we get the state of the previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel* m;
/* current state */
T2TStateBundle* s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include "T2TSearch.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TSearch::T2TSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
T2TSearch::~T2TSearch()
{
if (fullHypos != NULL)
delete[] fullHypos;
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TSearch::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamInt(argc, argv, "batchsize", &batchSize, 1);
LoadParamFloat(argc, argv, "lenalpha", &alpha, 1.0F);
LoadParamInt(argc, argv, "endid", endSymbols, -1);
LoadParamInt(argc, argv, "startid", &startSymbol, -1);
LoadParamFloat(argc, argv, "maxlenalpha", &scalarMaxLength, 2.0F);
LoadParamBool(argc, argv, "earlystop", &isEarlyStop, false);
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score)
{
T2TPredictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input->unitNum / input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(*input, &maskEnc, false);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(*input, input->order - 1, beamSize);
paddingBeam = Unsqueeze(*padding, padding->order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
/* max output-length = scalar * source-length */
int lengthLimit = (int)(input->GetDim(-1) * scalarMaxLength);
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
T2TStateBundle * states = new T2TStateBundle[lengthLimit + 1];
T2TStateBundle * first = states;
T2TStateBundle * cur = NULL;
T2TStateBundle * next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
/* generate the sequence from left to right */
for(int l = 0 ; l < lengthLimit; l++){
cur = states + l;
next = states + l + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, l == 0);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(next);
/* expand the search graph */
Expand(cur, next);
/* push complete hypotheses into the heap */
Collect(next);
/* stop searching when all hypotheses are completed */
if(IsAllCompleted(next)){
maxLength = l + 1;
break;
}
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(next);
Dump(output, score);
delete[] states;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void T2TSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor& lenPrev = prev->nstep;
XTensor& len = beam->nstep;
XTensor lp;
XTensor mask;
int order = prob.order;
int outputSize = prob.GetDim(-1);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
lp = T2TLengthPenalizer::GNMT(len, alpha);
lp.Reshape(lp.unitNum);
/* score = log-prob/lp */
_DivDim(&probPath, &lp, &score, 0);
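/* lp is assumed to follow the GNMT formula lp = ((5 + len) / 6)^alpha,
which keeps long hypotheses from being unfairly penalized */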
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
_SumDim(&score, &firstMask, &score, 0);
}
InitTensorV2(&mask, prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F, prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
_SumDim(&score, &mask, &score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
probPathPrev.Reshape(order - 1, dims);
lp.Reshape(order - 1, dims);
mask.Reshape(order - 1, dims);
}
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Generate(T2TStateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor indexCPU;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
XTensor &probPath = beam->probPath;
XTensor &prob = beam->prob;
int order = score.order;
for (int i = 0; i < order; i++) {
dims[i] = score.GetDim(i);
dimsBeam[i] = score.GetDim(i);
dimsTopK[i] = score.GetDim(i);
}
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.GetDim(-1);
int stride = score.GetDim(-1);
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
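/* dimsBeam views the scores as one row per source sentence with
beam-size * vocab-size columns, so a single TopK over the last axis
compares all hypotheses of the same sentence at once */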
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType, 1.0F, score.devID);
InitTensorV2(&index, order, dimsTopK, X_INT, 1.0F, score.devID);
InitTensorV2(&preID, order, dimsTopK, X_INT, 1.0F, -1);
InitTensorV2(&indexCPU, order, dimsTopK, X_INT, 1.0F, -1);
/* TODO: check the mask - mask the first and the padding id */
int dimMask[]{ score.GetDim(-1) };
XTensor mask;
InitTensorV2(&mask, 1, dimMask, X_FLOAT, 1.0F, -1);
mask.SetZeroAll();
mask.Set1D(-1e9F, 0);
mask.Set1D(-1e9F, 1);
mask.SetDevice(score.devID);
_SumDim(&score, &mask, 2);
score.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize);
CopyValues(index, indexCPU);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypothesis. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We recover the offset of each prediction
in the vocabulary by taking the top-k index modulo vocab-size. */
ModMe(index, sizeVocab);
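/* a small worked example: with vocab-size 4, a flat top-k index of 6 points
to previous state 6 / 4 = 1 and to token 6 % 4 = 2 in the vocabulary */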
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensorV2(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
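/* below we convert each per-row top-k index into a flat offset into the
(batch * beam * vocab) array: i steps by beamSize, so i * stride equals
(i / beamSize) * beam-size * vocab-size, i.e., the offset of the row
that the index belongs to */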
for (int i = 0; i < indexCPU.unitNum; i += beamSize){
for (int j = 0; j < beamSize; j++) {
indexCPU.SetInt(i * stride + indexCPU.GetInt(i + j), i + j);
}
}
CheckNTErrors(IsSameShaped(prob, probPath), "Wrong tensor shape!");
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensorV2(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensorV2(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
dimsTopK[i] = probPathTopK.GetDim(i);
}
order = probPath.order;
prob.Reshape(prob.unitNum, 1);
probPath.Reshape(probPath.unitNum, 1);
indexCPU.Reshape(indexCPU.GetDim(0), indexCPU.GetDim(-1));
indexCPU.SetDevice(prob.devID);
probTopK = Gather(prob, indexCPU);
probPathTopK = Gather(probPath, indexCPU);
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
prob.Reshape(order, dims);
probTopK.Reshape(order, dimsTopK);
probPath = probPathTopK;
prob = probTopK;
}
/*
expand the search graph
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
/* we copy the data to the CPU because frequent access to the GPU is slow,
and we can speed up the process by doing the job on the CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All of these are
maintained on the CPU to ease frequent access and modification of the states.
An alternative is to do this on the GPU, but it would require much more
coding work and the speed-up is not obvious. */
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
T2TState* last = prev->states + pid * beamSize + offset;
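/* "offset" is the position of the predecessor within the beam of sentence
"pid", so its flat index in the previous bundle is pid * beamSize + offset */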
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Collect(T2TStateBundle* beam)
{
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize, "Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted && (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if (state.isEnd && isCompleted) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void T2TSearch::FillHeap(T2TStateBundle* beam)
{
bool* emptyFlags = new bool[batchSize];
for (int i = 0; i < batchSize; i++)
emptyFlags[i] = (fullHypos[i].Count() == 0);
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted && (state.last == NULL || !state.last->isCompleted);
/* we push the incomplete hypothesis into the heap */
if (emptyFlags[state.pid] || state.isEnd || isCompleted)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
delete[] emptyFlags;
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
>> score - scores of the sequences
*/
void T2TSearch::Dump(XTensor * output, XTensor * score)
{
int dims[3] = { batchSize, beamSize, maxLength };
int* words = new int[maxLength];
InitTensorV2(output, 3, dims, X_INT);
InitTensorV2(score, 2, dims, X_FLOAT);
SetDataFixedInt(*output, -1);
score->SetZeroAll();
/* heap for an input sentence in the batch */
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
int c = heap.Count();
/* for each output in the beam */
for(int i = 0; i < beamSize && heap.Count() > 0; i++){
HeapNode<float> node = heap.Pop();
T2TState * state = (T2TState *)node.index;
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
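/* positions generated after the hypothesis finished are overwritten
with the end symbol, which is assumed to have id 2 here */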
if (isCompleted)
words[count++] = 2;
else
words[count++] = state->prediction;
state = state->last;
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, c - i - 1, w);
score->Set2D(node.value, h, c - i - 1);
}
}
delete[] words;
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool T2TSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void T2TSearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool T2TSearch::IsAllCompleted(T2TStateBundle * beam)
{
T2TState * states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
if(!state.isCompleted)
return false;
}
return true;
}
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor T2TSearch::MakeFirstMask(T2TStateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
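/* for illustration: with beamSize = 4 the mask is (0, -1e9, -1e9, -1e9) for
each sentence, so only the first hypothesis survives the first expansion,
since all entries of a fresh beam are identical */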
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9F, i);
}
mask.SetDevice(prob.devID);
return mask;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scaling factor on the source length (it determines the maximum number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output, XTensor* score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam);
/* save the output sequences and score */
void Dump(XTensor* output, XTensor* score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
#include "..//..//tensor/XTensor.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
/* data files */
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty())
{
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
res.values = sent;
res.id = indices[i];
batchLoader.resBuffer.emplace_back(res);
}
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
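/* the modulus of 1 makes this report fire after every batch */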
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n", elapsed, sentCount, wordCount);
}
}
batchLoader.RerankRes();
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
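/* stop at an invalid token or at the padding symbol (id 1 by default) */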
if (w < 0 || w == 1)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "t2tdata/DataSet.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* for batching */
DataSet batchLoader;
/* decoder for inference */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char** argv);
/* test the model */
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
strcpy(p, argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = atoi(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname)){
*p = true;
//fprintf(stderr, " %s=%s\n", name, "true");
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = (float)atof(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
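/* a minimal usage sketch (hypothetical option name, for illustration):

   int beam = 0;
   LoadParamInt(argc, argv, "beamsize", &beam, 4);

   this reads "-beamsize 8" from the command line into "beam", and falls
   back to the default value 4 when the option is absent */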
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <stdio.h>
namespace transformer
{
extern FILE * tmpFILE;
/* load arguments */
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
}
#endif
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -27,13 +26,13 @@ ...@@ -27,13 +26,13 @@
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include "T2TUtility.h" #include "Utility.h"
#include "../../../tensor/XGlobal.h" #include "../../tensor/XGlobal.h"
using namespace nts; using namespace nts;
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
/* /*
...@@ -41,7 +40,7 @@ load configurations from the command ...@@ -41,7 +40,7 @@ load configurations from the command
>> argc - number of arguments >> argc - number of arguments
>> argv - the list of arguments >> argv - the list of arguments
*/ */
T2TConfig::T2TConfig(int argc, const char** argv) Config::Config(int argc, const char** argv)
{ {
char** args = new char* [MAX_PARAM_NUM]; char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) { for (int i = 0; i < argc; i++) {
...@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
ShowParams(argsNum, args); ShowParams(argsNum, args);
/* options for the model */ /* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8); LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1); LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1); LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8); LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256); LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256); LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024); LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4); LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000); LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000); LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1); LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2); LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2); LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false); LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false); LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src"); LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt"); LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
...@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamString(argsNum, args, "train", trainFN, ""); LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, ""); LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0); LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048); LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1); LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true; isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true); LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1); LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0); LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0); LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 1.0F); LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0); LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 20); LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000); LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000); LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true); LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F); LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F); LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
...@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "shuffled", &isShuffled, true); LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1); LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1); LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false); LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1); LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "debug", &isDebugged, false);
LoadParamBool(argc, args, "sorted", &isLenSorted, false); LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000); LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
...@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true); LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false); LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false); LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, 0); LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */ /* options for translating */
LoadParamString(argsNum, args, "test", testFN, ""); LoadParamString(argsNum, args, "test", testFN, "");
...@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv) ...@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1); LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false); LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6); LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0); LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++) for (int i = 0; i < argc; i++)
delete[] args[i]; delete[] args[i];
...@@ -136,7 +139,7 @@ load configurations from a file ...@@ -136,7 +139,7 @@ load configurations from a file
>> args - the list to store the configurations >> args - the list to store the configurations
format: one option per line, separated by a blank or a tab format: one option per line, separated by a blank or a tab
*/ */
int T2TConfig::LoadFromFile(const char* configFN, char** args) { int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in); ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file"); CheckNTErrors(f.is_open(), "unable to open the config file");
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,18 +19,18 @@ ...@@ -20,18 +19,18 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#ifndef __T2TUTILITY_H__ #ifndef __UTILITY_H__
#define __T2TUTILITY_H__ #define __UTILITY_H__
#include <string> #include <string>
#include <cstdio> #include <cstdio>
#include "../../../tensor/XList.h" #include "../../tensor/XList.h"
using namespace std; using namespace std;
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
#define MAX_PARAM_NUM 100 #define MAX_PARAM_NUM 100
...@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter); ...@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter); FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter); UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */ /* configurations for */
class T2TConfig { class Config {
public: public:
/* path to the model */ /* path to the model */
char modelFN[1024]; char modelFN[1024];
...@@ -131,6 +130,12 @@ public: ...@@ -131,6 +130,12 @@ public:
/* indicates whether the model is running for machine translation */ /* indicates whether the model is running for machine translation */
bool isMT; bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */ /* indicates whether the model is running with FP16 data type */
bool useFP16; bool useFP16;
...@@ -164,9 +169,12 @@ public: ...@@ -164,9 +169,12 @@ public:
/* training epoch number */ /* training epoch number */
int nepoch; int nepoch;
/* traing step number */ /* training step number */
int nstep; int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */ /* indicates whether we use Adam */
bool useAdam; bool useAdam;
...@@ -193,9 +201,6 @@ public: ...@@ -193,9 +201,6 @@ public:
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */ /* indicates whether the sequence is sorted by length */
bool isLenSorted; bool isLenSorted;
...@@ -222,7 +227,7 @@ public: ...@@ -222,7 +227,7 @@ public:
public: public:
/* load configurations from the command */ /* load configurations from the command */
T2TConfig(int argc, const char** argv); Config(int argc, const char** argv);
/* load configurations from a file */ /* load configurations from a file */
int LoadFromFile(const char* configFN, char** args); int LoadFromFile(const char* configFN, char** args);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -15,22 +14,20 @@ ...@@ -15,22 +14,20 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include <cmath> #include "Attention.h"
#include "Embedding.h"
#include "T2TUtility.h" #include "../Utility.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TAttention::T2TAttention() Attention::Attention()
{ {
nhead = -1; nhead = -1;
dk = -1; dk = -1;
...@@ -39,7 +36,7 @@ T2TAttention::T2TAttention() ...@@ -39,7 +36,7 @@ T2TAttention::T2TAttention()
} }
/* de-constructor */ /* de-constructor */
T2TAttention::~T2TAttention() Attention::~Attention()
{ {
} }
...@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention() ...@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention()
initialize the model initialize the model
>> config - the configurations of the network >> config - the configurations of the network
*/ */
void T2TAttention::InitModel(T2TConfig& config) void Attention::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
useRPR = config.useRPR; useRPR = config.useRPR;
...@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config) ...@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config)
maxRP = config.maxRP; maxRP = config.maxRP;
dropoutP = config.attDropout; dropoutP = config.attDropout;
InitTensor2D(&wq, d, d, X_FLOAT, devID); /* initialize the parameters */
InitTensor1D(&bq, d, X_FLOAT, devID); InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID); InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID); InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID); InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID); InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR) if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID); InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&wo, d, d, X_FLOAT, devID);
InitTensor1D(&bo, d, X_FLOAT, devID); InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F; float scale = 1.0F;
_SetDataFanInOut(&wk, scale); _SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&wq, scale); _SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&wv, scale); _SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&wo, scale); _SetDataFanInOut(&weightO, scale);
if (useRPR) if (useRPR)
_SetDataFanInOut(&RPEmbK, scale); _SetDataFanInOut(&RPEmbK, scale);
bk.SetZeroAll();
bq.SetZeroAll(); biasQ.SetZeroAll();
bv.SetZeroAll(); biasO.SetZeroAll();
bo.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
} }
/* /*
...@@ -96,30 +99,30 @@ make the network ...@@ -96,30 +99,30 @@ make the network
>> cacheType - type of cache, e.g., self-attention >> cacheType - type of cache, e.g., self-attention
<< return - multi-attention result << return - multi-attention result
*/ */
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType) bool isTraining, Cache* cache, int attType)
{ {
const bool isEnc = (!cache) ? true : false; const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */ /* linear transformation before self-attention */
XTensor q2, k2, v2; XTensor q2, k2, v2;
q2 = MulAndShift(q, wq, bq); q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining) { if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers */ /* self attention for encoder layers */
k2 = MulAndShift(k, wk, bk); k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, wv, bv); v2 = MulAndShift(v, weightV, biasV);
if (useRPR) if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc); return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining); return MakeAttention(k2, q2, v2, mask, isTraining);
} }
else { else {
if (cacheType == SELF_ATT) { if (attType == SELF_ATT) {
k2 = MulAndShift(k, wk, bk); k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, wv, bv); v2 = MulAndShift(v, weightV, biasV);
/* if hit, we only concat the cache with the new token */ /* if hit, we only concat the cache with the new token */
if (!cache->miss) { if (!cache->miss) {
...@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, ...@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc); return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining); return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
} }
else if (cacheType == EN_DE_ATT) { else if (attType == EN_DE_ATT) {
if (cache->miss) { if (cache->miss) {
cache->key = MulAndShift(k, wk, bk); cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, wv, bv); cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false; cache->miss = false;
} }
...@@ -155,7 +158,7 @@ make the attention network given keys, queries and values (after linear transfor ...@@ -155,7 +158,7 @@ make the attention network given keys, queries and values (after linear transfor
>> mask - as it is >> mask - as it is
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
*/ */
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining) XTensor* mask, bool isTraining)
{ {
XTensor kheads; XTensor kheads;
...@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS); dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask) if (mask)
dot = dot + (*mask); dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead)); dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
...@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType); att = ConvertDataType(att, dataType);
/* concatenate the heads */ /* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo); return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
} }
/* /*
...@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation) ...@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation)
>> isTraining - indicates whether the model is used for training >> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder >> isEnc - indicates whether it is encoder
*/ */
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc) XTensor* mask, bool isTraining, bool isEnc)
{ {
XTensor kheads; XTensor kheads;
XTensor qheads; XTensor qheads;
XTensor vheads; XTensor vheads;
const int batchSize = q.dimSize[0]; const int batchSize = q.GetDim(0);
const int lenQ = q.dimSize[1]; const int lenQ = q.GetDim(1);
const int lenKV = k.dimSize[1]; const int lenKV = k.GetDim(1);
const auto dataType = k.dataType; const auto dataType = k.dataType;
...@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor embMatrix, relativeKey; XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */ /* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc); embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */ /* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix); relativeKey = Gather(RPEmbK, embMatrix);
...@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT); relativeKey = ConvertDataType(relativeKey, X_FLOAT);
} }
ScaleAndShiftMe(qheads, 1.0F / float(nhead)); float scaling = sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true); dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask) if (mask)
dot = dot + (*mask); dot = dot + *mask;
/* softmax */ /* softmax */
scalar = Softmax(dot, -1); scalar = Softmax(dot, -1);
...@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, ...@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType); att = ConvertDataType(att, dataType);
/* concatenate the heads */ /* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo); return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
} }
/* /*
...@@ -284,7 +288,7 @@ generate relative position embeddings ...@@ -284,7 +288,7 @@ generate relative position embeddings
>> lenKV - the length of key and value >> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position >> maxRelativeLen - the maximum length of relative position
*/ */
XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc) const int maxRelativeLen, const bool isEnc)
{ {
XTensor range; XTensor range;
...@@ -300,7 +304,7 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, ...@@ -300,7 +304,7 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
XTensor range2DTrans; XTensor range2DTrans;
range2D = Unsqueeze(range, 0, lenQ); range2D = Unsqueeze(range, 0, lenQ);
range2DTrans = Transpose(range2D, 0, 1); range2DTrans = Transpose(range2D, 0, 1);
embMatrix = Sum(range2D, range2DTrans, -1); embMatrix = Sum(range2D, range2DTrans, false, -1);
} }
else { else {
for (int i = 0; i < lenKV; i++) for (int i = 0; i < lenKV; i++)
...@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV, ...@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
embMatrix = Unsqueeze(range, 0, lenQ); embMatrix = Unsqueeze(range, 0, lenQ);
} }
ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen)); //ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen)); embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index; delete[] index;
return embMatrix; return embMatrix;
} }
/* /*
Relative position-aware dot-product attention inner calculation. relative position-aware dot-product attention inner calculation.
>> x - Tensor with shape [batch_size*heads, length, length or depth]. >> x - Tensor with shape [batch_size*heads, length, length or depth].
>> y - Tensor with shape [batch_size*heads, length, depth]. >> y - Tensor with shape [batch_size*heads, length, depth].
>> z - Tensor with shape [length, length, depth]. >> z - Tensor with shape [length, length, depth].
>> isKey - Whether y is key. >> isKey - Whether y is key.
<< return - A Tensor with shape [batch_size*heads, length, length or depth]. << return - A Tensor with shape [batch_size*heads, length, length or depth].
*/ */
XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey) XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{ {
const int headNum = nhead; const int headNum = nhead;
const int batchSize = x.dimSize[1]; const int batchSize = x.GetDim(1);
const int lenQ = x.dimSize[2]; const int lenQ = x.GetDim(2);
const int lenKV = y.dimSize[2]; const int lenKV = y.GetDim(2);
const int depth = y.dimSize[3]; const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth; const int lastDim = isKey ? lenKV : depth;
MATRIX_TRANS_TYPE transposeFlag = isKey ? X_TRANS : X_NOTRANS; auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
XTensor context; int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
context = MatrixMulBatched(x, X_NOTRANS, y, transposeFlag); int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
int mergeDims[] = { headNum * batchSize, lenQ, x.dimSize[3] }; XTensor context;
x.Reshape(3, mergeDims); context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans; XTensor xTrans;
xTrans = Transpose(x, 0, 1); xTrans = Transpose(x, 0, 1);
...@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo ...@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo
relativeTrans = Transpose(relative, 0, 1); relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim }; int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans.Reshape(4, splitDims);
return Sum(context, relativeTrans); relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
} }
/* constructor */ /* constructor */
Cache::Cache() Cache::Cache()
{ {
miss = true; miss = true;
enable = true;
} }
/* update the states cache */ /* update the states cache */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,17 +19,17 @@ ...@@ -20,17 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TATTENTION_H__ #ifndef __ATTENTION_H__
#define __T2TATTENTION_H__ #define __ATTENTION_H__
#include "T2TNNUtil.h" #include "NNUtil.h"
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* attention type */ /* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT }; enum { NONE, SELF_ATT, EN_DE_ATT };
...@@ -50,6 +49,9 @@ public: ...@@ -50,6 +49,9 @@ public:
/* indicates cache miss if 'true' */ /* indicates cache miss if 'true' */
bool miss; bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */ /* constructor */
Cache(); Cache();
...@@ -64,7 +66,7 @@ public: ...@@ -64,7 +66,7 @@ public:
}; };
/* multi-head attention */ /* multi-head attention */
class T2TAttention class Attention
{ {
public: public:
/* device id */ /* device id */
...@@ -74,22 +76,22 @@ public: ...@@ -74,22 +76,22 @@ public:
int nhead; int nhead;
/* transformation matrix for Q */ /* transformation matrix for Q */
XTensor wq; XTensor weightQ;
/* bias for Q */ /* bias for Q */
XTensor bq; XTensor biasQ;
/* transformation matrix for K */ /* transformation matrix for K */
XTensor wk; XTensor weightK;
/* bias for K */ /* bias for K */
XTensor bk; XTensor biasK;
/* transformation matrix for V */ /* transformation matrix for V */
XTensor wv; XTensor weightV;
/* bias for V */ /* bias for V */
XTensor bv; XTensor biasV;
XTensor wBig; XTensor wBig;
...@@ -99,10 +101,10 @@ public: ...@@ -99,10 +101,10 @@ public:
XTensor RPEmbK; XTensor RPEmbK;
/* transformation after dot-product attention */ /* transformation after dot-product attention */
XTensor wo; XTensor weightO;
/* bias after dot-product attention */ /* bias after dot-product attention */
XTensor bo; XTensor biasO;
/* size of transformed Q and K */ /* size of transformed Q and K */
int dk; int dk;
...@@ -124,13 +126,13 @@ public: ...@@ -124,13 +126,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TAttention(); Attention();
/* de-constructor */ /* de-constructor */
~T2TAttention(); ~Attention();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v, XTensor Make(XTensor& k, XTensor& q, XTensor& v,
...@@ -145,8 +147,10 @@ public: ...@@ -145,8 +147,10 @@ public:
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc); XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc); XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key); XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,13 +19,11 @@ ...@@ -20,13 +19,11 @@
* This file includes some common modules of the Transformer model * This file includes some common modules of the Transformer model
*/ */
#include <cmath> #include "CommonModules.h"
#include "T2TCommonModules.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* /*
...@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer ...@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
>> before - whether we use layernorm before attention/fnn >> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn >> after - whether we use layernorm after attention/fnn
*/ */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after) XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{ {
if (after ^ prenorm) if (after ^ prenorm)
return ln.Make(input); return ln.Make(input);
......
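The `after ^ prenorm` test is terse; read it as a truth table (the pass-through case, outside the visible hunk, is assumed to return the input unchanged):
    /* when does LayerNorm(input, ln, prenorm, before, after) apply ln.Make?
       prenorm == true  (pre-norm model):  at the "before" call site (after == false)
       prenorm == false (post-norm model): at the "after" call site  (after == true)
       in the remaining two combinations the input passes through unchanged */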
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -22,16 +21,16 @@ ...@@ -22,16 +21,16 @@
#ifndef __COMMONMODULE_H__ #ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__ #define __COMMONMODULE_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TCommonModules.h" #include "CommonModules.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* the layer normalization module to control pre-norm or post-norm */ /* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after); XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,17 +19,15 @@ ...@@ -20,17 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/ */
#include <cmath> #include "Embedding.h"
#include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TEmbedder::T2TEmbedder() Embedder::Embedder()
{ {
devID = -1; devID = -1;
vSize = -1; vSize = -1;
...@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder() ...@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
} }
/* de-constructor */ /* de-constructor */
T2TEmbedder::~T2TEmbedder() Embedder::~Embedder()
{ {
} }
...@@ -47,7 +44,7 @@ initialize the model ...@@ -47,7 +44,7 @@ initialize the model
>> config - configurations of the model >> config - configurations of the model
>> isEnc - indicates if it is used for the encoder >> isEnc - indicates if it is used for the encoder
*/ */
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc) void Embedder::InitModel(Config& config, bool isEnc)
{ {
devID = config.devID; devID = config.devID;
d = config.modelSize; d = config.modelSize;
...@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc) ...@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
make positional embeddings (of size eSize * length) make positional embeddings (of size eSize * length)
>> length - length of the sequence >> length - length of the sequence
*/ */
void T2TEmbedder::MakePosEmbedding(int length) void Embedder::MakePosEmbedding(int length)
{ {
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID); InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
...@@ -110,58 +107,45 @@ make the network ...@@ -110,58 +107,45 @@ make the network
>> isTraining - indicates whether it is training >> isTraining - indicates whether it is training
<< return - word & position embeddings of the input << return - word & position embeddings of the input
*/ */
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep) XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{ {
/* make sure the padding index is 1 */ /* make sure the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!"); CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!"); CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\""); CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\""); CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding; XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU; InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec) if (!isDec || isTraining || input.GetDim(-1) > 1)
{ {
/* encoder embeddings */ position.Range(0, position.unitNum, 1);
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1; // disable grad
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1]; ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
} }
else else
{ {
/* decoder embeddings */ /* decoder embeddings during decoding */
position.SetDataFixed(nstep + 2); position.SetDataFixed(nstep + padIdx + 1);
} }
delete[] posData;
/* we make positional embeddings first */ /* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position); XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */ /* then we make word embeddings */
//w.enableGrad = false;
wordEmbedding = Gather(w, input); wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize)); wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */ /* we sum over the two embeddings */
return wordEmbedding + posEmbedding; SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
} }
} }
\ No newline at end of file
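The replacement of the per-token CPU loop above is easier to follow with a worked example (assuming padIdx == 1 and a sequence of length 4):
    /* encoder branch:
       position.Range(0, 4, 1)                      -> [0, 1, 2, 3]
       ScaleAndShiftMe(position, 1.0F, padIdx + 1)  -> [2, 3, 4, 5]
       single-step decoding instead fills every position with
       nstep + padIdx + 1, so real positions never collide with
       the padding index */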
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,15 +19,15 @@ ...@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/ */
#ifndef __T2TEMBEDDING_H__ #ifndef __EMBEDDING_H__
#define __T2TEMBEDDING_H__ #define __EMBEDDING_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network/XNet.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
#define DEFAULT_EMBEDDING_SIZE 512 #define DEFAULT_EMBEDDING_SIZE 512
...@@ -37,7 +36,7 @@ namespace transformer ...@@ -37,7 +36,7 @@ namespace transformer
embedding (of word at position i): embedding (of word at position i):
word embedding + positional embedding word embedding + positional embedding
*/ */
class T2TEmbedder class Embedder
{ {
public: public:
/* device id */ /* device id */
...@@ -52,7 +51,7 @@ public: ...@@ -52,7 +51,7 @@ public:
/* maximum length of the sequence */ /* maximum length of the sequence */
int maxLength; int maxLength;
/* dimension size of the hidden layers in the t2t model */ /* dimension size of the hidden layers in the model */
int d; int d;
/* padding index */ /* padding index */
...@@ -67,13 +66,13 @@ public: ...@@ -67,13 +66,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TEmbedder(); Embedder();
/* de-constructor */ /* de-constructor */
~T2TEmbedder(); ~Embedder();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config, bool isEnc = true); void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */ /* make positional embeddings */
void MakePosEmbedding(int length); void MakePosEmbedding(int length);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,19 +19,17 @@ ...@@ -20,19 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "FNN.h"
#include "Embedding.h"
#include "T2TFNN.h" #include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TFNN::T2TFNN() FNN::FNN()
{ {
inSize = -1; inSize = -1;
outSize = -1; outSize = -1;
...@@ -40,7 +37,7 @@ T2TFNN::T2TFNN() ...@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
} }
/* de-constructor */ /* de-constructor */
T2TFNN::~T2TFNN() FNN::~FNN()
{ {
} }
...@@ -50,7 +47,7 @@ initialize the model ...@@ -50,7 +47,7 @@ initialize the model
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TFNN::InitModel(T2TConfig& config) void FNN::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
...@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config) ...@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
_SetDataFanInOut(&w1, scale); _SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale); _SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll(); b1.SetZeroAll();
b2.SetZeroAll(); b2.SetZeroAll();
} }
...@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2 ...@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor >> input - the input tensor
>> return - the output tensor >> return - the output tensor
*/ */
XTensor T2TFNN::Make(XTensor& input, bool isTraining) XTensor FNN::Make(XTensor& input, bool isTraining)
{ {
XTensor t1; XTensor t1;
......
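The hunk above truncates the body of FNN::Make. For reference, a minimal sketch of the documented formula y = max(0, x * w1 + b1) * w2 + b2, assuming the library's MMul and Rectify helpers and eliding dropout during training:
    /* illustrative sketch, not the actual FNN::Make */
    XTensor FNNSketch(FNN& fnn, XTensor& input)
    {
        /* t1 = max(0, x * w1 + b1) */
        XTensor t1 = Rectify(MMul(input, fnn.w1) + fnn.b1);
        /* y = t1 * w2 + b2 */
        return MMul(t1, fnn.w2) + fnn.b2;
    }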
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,20 +19,20 @@ ...@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TFNN_H__ #ifndef __FNN_H__
#define __T2TFNN_H__ #define __FNN_H__
#include "T2TUtility.h" #include "LayerNorm.h"
#include "T2TLayerNormal.h" #include "../Utility.h"
#include "../../../tensor/XTensor.h" #include "../../../tensor/XTensor.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */ /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN class FNN
{ {
public: public:
/* device id */ /* device id */
...@@ -66,13 +65,13 @@ public: ...@@ -66,13 +65,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TFNN(); FNN();
/* de-constructor */ /* de-constructor */
~T2TFNN(); ~FNN();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input, bool isTraining); XTensor Make(XTensor& input, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,16 +18,13 @@ ...@@ -19,16 +18,13 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/ */
#include "GLU.h"
#include <cmath> #include "Embedding.h"
#include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
...@@ -48,7 +44,7 @@ GLU::~GLU() ...@@ -48,7 +44,7 @@ GLU::~GLU()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void GLU::InitModel(T2TConfig& config) void GLU::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -23,12 +22,11 @@ ...@@ -23,12 +22,11 @@
#ifndef __GLU_H__ #ifndef __GLU_H__
#define __GLU_H__ #define __GLU_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TGatedLinearUnit.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* a gated linear unit (GLU) */ /* a gated linear unit (GLU) */
...@@ -68,7 +66,7 @@ public: ...@@ -68,7 +66,7 @@ public:
~GLU(); ~GLU();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input); XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,19 +18,16 @@ ...@@ -19,19 +18,16 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/ */
#include <cmath> #include "Embedding.h"
#include "LayerNorm.h"
#include "T2TUtility.h" #include "LayerHistory.h"
#include "T2TEmbedding.h" #include "../Utility.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false) #define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false) #define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
...@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory() ...@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void LayerHistory::InitModel(T2TConfig& config) void LayerHistory::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
d = config.modelSize; d = config.modelSize;
...@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config) ...@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID); InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer]; layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */ /* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) { for (int i = 0; i < nlayer; i++) {
......
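A note on the declarations above: `weight` is sized (nlayer + 1) x (nlayer + 1) and one layer-normalization unit is kept per layer, which is consistent with each layer consuming a learned, normalized combination of the embedding and all preceding layer outputs. The Add/Pop logic that would confirm this lies outside the visible hunks, so read this as an inference from the shapes rather than a description of the implementation.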
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -22,14 +21,14 @@ ...@@ -22,14 +21,14 @@
#ifndef __LAYERHISTORY_H__ #ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__ #define __LAYERHISTORY_H__
#include "T2TLayerNormal.h" #include "LayerNorm.h"
#include "T2TLayerHistory.h" #include "LayerHistory.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* /*
...@@ -61,7 +60,7 @@ public: ...@@ -61,7 +60,7 @@ public:
TensorList history; TensorList history;
/* layer normalization for each intermediate layer */ /* layer normalization for each intermediate layer */
T2TLN* layerNorms; LN* layerNorms;
public: public:
/* constructor */ /* constructor */
...@@ -71,7 +70,7 @@ public: ...@@ -71,7 +70,7 @@ public:
~LayerHistory(); ~LayerHistory();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* add the layer output to the history */ /* add the layer output to the history */
void Add(XTensor& tensor); void Add(XTensor& tensor);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,24 +19,23 @@ ...@@ -20,24 +19,23 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "Embedding.h"
#include "T2TUtility.h" #include "LayerNorm.h"
#include "T2TEmbedding.h" #include "../Utility.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TLN::T2TLN() LN::LN()
{ {
devID = -1; devID = -1;
d = 0; d = 0;
} }
/* de-constructor */ /* de-constructor */
T2TLN::~T2TLN() LN::~LN()
{ {
} }
...@@ -47,7 +45,7 @@ initialize the model ...@@ -47,7 +45,7 @@ initialize the model
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TLN::InitModel(T2TConfig& config) void LN::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
...@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config) ...@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
InitTensor1D(&b, d, X_FLOAT, devID); InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F); w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll(); b.SetZeroAll();
w.SetDataFixed(1);
} }
/* /*
...@@ -64,7 +64,7 @@ make the network ...@@ -64,7 +64,7 @@ make the network
>> input - the input tensor >> input - the input tensor
>> return - layer normalization output >> return - layer normalization output
*/ */
XTensor T2TLN::Make(XTensor& input) XTensor LN::Make(XTensor& input)
{ {
XTensor& x = input; XTensor& x = input;
XTensor xn; XTensor xn;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,20 +19,20 @@ ...@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TLAYERNORMAL_H__ #ifndef __LAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__ #define __LAYERNORMAL_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../network/XNet.h" #include "../../../network//XNet.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* layer normalization: y = norm(x) * w + b /* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */ where norm(x) = (x - mean)/standardDeviation */
class T2TLN class LN
{ {
public: public:
/* device id */ /* device id */
...@@ -50,13 +49,13 @@ public: ...@@ -50,13 +49,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TLN(); LN();
/* de-constructor */ /* de-constructor */
~T2TLN(); ~LN();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network */ /* make the network */
XTensor Make(XTensor& input); XTensor Make(XTensor& input);
......
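Given the header's formula y = norm(x) * w + b with norm(x) = (x - mean)/standardDeviation, a minimal sketch of the computation, assuming the library's ReduceMean, ReduceVariance and Normalize helpers (the actual LN::Make body lies outside the visible hunks):
    /* illustrative sketch, not the actual LN::Make */
    XTensor LNSketch(LN& ln, XTensor& input)
    {
        int dim = input.order - 1;
        /* per-position statistics over the feature dimension */
        XTensor mean = ReduceMean(input, dim);
        XTensor variance = ReduceVariance(input, dim, mean);
        /* y = (x - mean) / standardDeviation * w + b */
        return Normalize(input, dim, mean, variance, ln.w, ln.b, 0.0F);
    }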
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,12 +15,12 @@ ...@@ -16,12 +15,12 @@
*/ */
/* /*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/ */
#include "T2TNNUtil.h" #include "NNUtil.h"
namespace transformer namespace nmt
{ {
/* /*
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -16,11 +15,11 @@ ...@@ -16,11 +15,11 @@
*/ */
/* /*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/ */
#ifndef __T2TNNUTIL_H__ #ifndef __NNUTIL_H__
#define __T2TNNUTIL_H__ #define __NNUTIL_H__
#include "../../../tensor/XGlobal.h" #include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
...@@ -28,7 +27,7 @@ ...@@ -28,7 +27,7 @@
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* the gather function for tensor with any dimension */ /* the gather function for tensor with any dimension */
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,18 +19,16 @@ ...@@ -20,18 +19,16 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <cmath> #include "Output.h"
#include "Embedding.h"
#include "T2TOutput.h" #include "../Utility.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TOutput::T2TOutput() Output::Output()
{ {
devID = -1; devID = -1;
vSize = -1; vSize = -1;
...@@ -39,7 +36,7 @@ T2TOutput::T2TOutput() ...@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
} }
/* de-constructor */ /* de-constructor */
T2TOutput::~T2TOutput() Output::~Output()
{ {
} }
...@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput() ...@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
initialize the model initialize the model
>> config - configurations of the model >> config - configurations of the model
*/ */
void T2TOutput::InitModel(T2TConfig& config) void Output::InitModel(Config& config)
{ {
devID = config.devID; devID = config.devID;
hSize = config.modelSize; hSize = config.modelSize;
...@@ -66,7 +63,7 @@ make the network (redefined output tensor) ...@@ -66,7 +63,7 @@ make the network (redefined output tensor)
>> isTraining - whether it is used for training >> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax >> normalized - whether ignore the log-softmax
*/ */
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized) void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{ {
XTensor& x = input; XTensor& x = input;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,19 +19,19 @@ ...@@ -20,19 +19,19 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TOUTPUT_H__ #ifndef __OUTPUT_H__
#define __T2TOUTPUT_H__ #define __OUTPUT_H__
#include "T2TUtility.h" #include "../Utility.h"
#include "../../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* output layer */ /* output layer */
class T2TOutput class Output
{ {
public: public:
/* device id */ /* device id */
...@@ -49,13 +48,13 @@ public: ...@@ -49,13 +48,13 @@ public:
public: public:
/* constructor */ /* constructor */
T2TOutput(); Output();
/* de-constructor */ /* de-constructor */
~T2TOutput(); ~Output();
/* initialize the model */ /* initialize the model */
void InitModel(T2TConfig& config); void InitModel(Config& config);
/* make the network (redefined output tensor) */ /* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized); void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include "..//..//..//tensor/XUtility.h"
using namespace nts;
/* sort examples by sequence length (longest first) */
bool Compare(const Example& a, const Example& b) {
return a.values.size() > b.values.size();
}
/* sort results by their original line ids */
bool CompareRes(const Result& a, const Result& b) {
return a.id < b.id;
}
/* restore the original order of the results for output */
void DataSet::RerankRes() {
sort(resBuffer.begin(), resBuffer.end(), CompareRes);
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
const string tokenDelimiter = " ";
int id = 0;
while (getline(*fp, line)) {
vector<int> values = Split<int>(line, tokenDelimiter);
Example example;
example.id = id++;
example.values = values;
buffer.emplace_back(example);
}
if (fp->eof()) {
/* reset the state flags so the stream can be read again */
fp->clear();
fp->seekg(0, fp->beg);
}
if (sortBuffer) {
sort(buffer.begin(), buffer.end(), Compare);
}
resBuffer.reserve(buffer.size());
}
/*
select a field and generate a mini-batch by indices
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> batchSize - batch size
>> devID - device id, -1 for CPU
*/
vector<int> DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID)
{
/* the real batch size may be smaller when the buffer is nearly exhausted */
size_t realBatchSize = batchSize;
if ((buffer.size() - bufferUsed) < batchSize) {
realBatchSize = buffer.size() - bufferUsed;
}
/* get the maximum sentence length in the mini-batch */
size_t maxLen = 0;
for (size_t i = 0; i < realBatchSize; ++i) {
maxLen = max(maxLen, buffer[bufferUsed + i].values.size());
}
CheckNTErrors(maxLen != 0, "Wrong length detected!");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
/* fill the batch with the padding index (1) and clear the paddings */
for (size_t i = 0; i < realBatchSize * maxLen; ++i) {
batchValues[i] = 1;
}
memset(paddingValues, 0, sizeof(float) * maxLen * realBatchSize);
size_t cur = 0;
/* left padding */
vector<int> indices;
indices.reserve(realBatchSize);
for (size_t i = 0; i < realBatchSize; ++i) {
indices.push_back(buffer[bufferUsed + i].id);
cur = maxLen * (i + 1) - buffer[bufferUsed+i].values.size();
for (int v : buffer[bufferUsed + i].values) {
batchValues[cur] = v;
paddingValues[cur++] = 1.0F;
}
cur = maxLen * (i + 1);
}
InitTensor2DV2(batchEnc, (int)realBatchSize, (int)maxLen, X_INT, devID);
InitTensor2DV2(paddingEnc, (int)realBatchSize, (int)maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return indices;
}
/*
initialize the dataset and load data into the buffer
>> fname - path of the data file
*/
void DataSet::Init(const char* fname)
{
fp = new ifstream(fname);
CheckNTErrors(fp->is_open(), "Cannot open the file!");
bufferUsed = 0;
LoadDataToBuffer();
if (bufferSize == 0)
bufferSize = buffer.size();
}
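A typical consumer of this class loops as below; the model invocation is a placeholder and not part of this file:
    DataSet set;
    set.Init("input.txt");
    XTensor batchEnc;
    XTensor paddingEnc;
    while (!set.IsEmpty()) {
        /* the returned indices are the original line ids of the batch */
        vector<int> indices = set.LoadBatch(&batchEnc, &paddingEnc, 32, -1);
        /* ... run the model and append one Result per index to set.resBuffer ... */
    }
    set.RerankRes();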
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include "../../..//tensor/XTensor.h"
#include "../../..//tensor/XGlobal.h"
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using namespace nts;
/* a single input sample: its original line id and the token ids */
struct Example {
int id;
vector<int> values;
};
/* a single output result: the original line id and its output tensor */
struct Result {
int id;
XTensor values;
};
using BufferType = vector<Example>;
using ResBufferType = vector<Result>;
bool Compare(const Example& a, const Example& b);
bool CompareRes(const Result& a, const Result& b);
namespace nts { // namespace nts(NiuTrans.Tensor)
/* A `DataSet` is associated with a file which contains variable-length data. */
struct DataSet {
/* the data buffer */
BufferType buffer;
/* the result buffer */
ResBufferType resBuffer;
/* the pointer to file stream */
ifstream* fp{nullptr};
/* size of the data buffer */
size_t bufferSize{ 0 };
/* size of used data in buffer */
size_t bufferUsed{ 0 };
/* whether to sort the dataset (by length) */
bool sortBuffer{ true };
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* rerank result for output */
void RerankRes();
/* generate a mini-batch */
vector<int> LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID);
/* initialization function */
void Init(const char* fname);
/* check if the buffer is empty */
bool IsEmpty() {
return bufferUsed >= bufferSize;
}
/* de-constructor */
~DataSet() {
if (fp)
fp->close();
delete fp;
}
};
} // namespace nts(NiuTrans.Tensor)
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#include "StringUtil.h"
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
fields.emplace_back(0, s.length());
return fields;
}
size_t pos = 0;
size_t start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
fields.emplace_back((int)start, (int)pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
fields.emplace_back((int)start, (int)s.length());
}
return fields;
}
}
\ No newline at end of file
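A worked example of the contract (consecutive delimiters produce no empty field):
    /* SplitToPos("12 7  3", " ") -> {(0, 2), (3, 4), (6, 7)},
       i.e. the substrings "12", "7" and "3"; the doubled space
       contributes no empty field */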
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-03-18
*/
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
using namespace std;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtof(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
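Usage is then one call per input line, as in DataSet::LoadDataToBuffer, e.g.:
    vector<int> ids = Split<int>("12 7 9941 3", " ");    /* {12, 7, 9941, 3} */
    vector<float> ws = Split<float>("0.5 1.25", " ");    /* {0.5F, 1.25F} */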
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TBatchLoader.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../network/XNoder.h"
namespace transformer
{
/* constructor */
T2TBatchLoader::T2TBatchLoader()
{
seqLen = NULL;
seqLen2 = NULL;
nseqBuf = 0;
nextSeq = -1;
nextBatch = -1;
buf = NULL;
buf2 = NULL;
bufBatch = NULL;
bufSize = 0;
bufBatchSize = 0;
seqOffset = NULL;
}
/* de-constructor */
T2TBatchLoader::~T2TBatchLoader()
{
delete[] buf;
delete[] buf2;
delete[] bufBatch;
delete[] seqLen;
delete[] seqLen2;
delete[] seqOffset;
}
/*
initialization
>> config - configurations of the model
*/
void T2TBatchLoader::Init(T2TConfig& config)
{
bufSize = config.bufSize;
isDoubledEnd = config.isDoubledEnd;
isSmallBatch = config.isSmallBatch;
isBigBatch = config.isBigBatch;
isRandomBatch = config.isRandomBatch;
bucketSize = config.bucketSize;
buf = new int[bufSize];
buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize];
seqLen2 = new int[bufSize];
seqOffset = new int[bufSize];
}
/* a shared buffer for reading lines of the training data */
char line[MAX_SEQUENCE_LENGTH];
struct SampleNode
{
int id;
int offset;
int* p;
int size;
int value;
int key;
};
/* sort samples by their maximum sequence length (longest first) */
int CompareSampleNode(const void* a, const void* b)
{
return ((SampleNode*)b)->value - ((SampleNode*)a)->value;
}
/* sort samples by their random keys (for shuffling within buckets) */
int CompareSampleNodeV2(const void* a, const void* b)
{
return ((SampleNode*)b)->key - ((SampleNode*)a)->key;
}
/*
load data to buffer
>> file - where to load data
>> isSorted - indicates whether the samples are sorted by length
>> step - the number of sequences we go over when move to the next sample
*/
int T2TBatchLoader::LoadBuf(FILE* file, bool isSorted, int step)
{
int lineCount = 0;
int seqCount = 0;
int wordCount = 0;
while (fgets(line, MAX_SEQUENCE_LENGTH - 1, file)) {
int len = (int)strlen(line);
/* strip trailing newline characters (guarding against empty lines) */
while (len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')) {
line[len - 1] = 0;
len--;
}
if (len == 0)
continue;
/* how many characters are in a word */
int wSize = 0;
/* how many words are in the sentence */
int wNum = 0;
int wNumLocal = 0;
int i = 0;
for (i = 0; i < len; i++) {
/* load a word (id) separated by a space or tab */
if ((line[i] == ' ' || line[i] == '\t') && wSize > 0) {
line[i] = 0;
if (wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|') {
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wNumLocal = 0;
}
else {
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
wSize = 0;
}
else
wSize++;
}
if (wSize > 0) {
buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++;
}
seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++;
wordCount += wNum;
lineCount++;
if (wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
break;
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
}
nseqBuf = seqCount;
nextSeq = 0;
/* sort the sequences by length */
if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode* nodes = new SampleNode[seqCount];
int count = 0;
int offset = 0;
for (int i = 0; i < seqCount; i += step) {
SampleNode& node = nodes[count];
node.id = count;
node.offset = i;
node.p = buf + offset;
node.size = 0;
int max = 0;
for (int j = 0; j < step; j++) {
node.size += seqLen[i + j];
max = MAX(max, seqLen[i + j]);
}
node.value = max;
node.key = rand();
count++;
offset += node.size;
}
qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);
/* distribute samples into buckets. In each bucket, sequences have
similar lengths */
if (bucketSize > 0) {
int low = 0;
int high = low + bucketSize;
int n = count - 1;
int m = n;
int num = 0;
while (num < count) {
for (m = n; m >= 0; m--) {
if (nodes[m].value > high)
break;
}
qsort(nodes + m + 1, n - m, sizeof(SampleNode), CompareSampleNodeV2);
num += (n - m);
n = m;
low += bucketSize;
high = low + bucketSize;
}
}
count = 0;
offset = 0;
for (int i = 0; i < seqCount; i += step) {
SampleNode& node = nodes[count];
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for (int j = 0; j < step; j++) {
seqLen2[i + j] = seqLen[node.offset + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
}
count += 1;
offset += node.size;
}
int* tmp = buf;
buf = buf2;
buf2 = tmp;
tmp = seqLen;
seqLen = seqLen2;
seqLen2 = tmp;
delete[] nodes;
}
return lineCount;
}
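The expected file format, as implied by the parsing above: one sample per line, token ids separated by spaces or tabs; for MT data (step == 2) the source and target sides of a pair are delimited by a standalone "|||", e.g.
    12 7 9941 3 ||| 15 8 27 3
so each line contributes `step` sequences to seqLen/seqOffset.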
/* clear the data buffer */
void T2TBatchLoader::ClearBuf()
{
nseqBuf = 0;
nextSeq = -1;
}
/*
set the random batch flag
>> flag - as it is
*/
void T2TBatchLoader::SetRandomBatch(bool flag)
{
isRandomBatch = flag;
}
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining)
{
if (isLM) {
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, isTraining);
}
else {
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, ws, wCount, devID, isTraining);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSize - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vSize, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining)
{
if (nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
int wn = 0;
int sc = 0;
int max = 0;
while (seq + sc < nseqBuf) {
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
CheckNTErrors(len > 0, "Empty sequence!");
wn = len;
wc += wn;
sc += 1;
if (max < wn)
max = wn;
int tc = isBigBatch ? wc : max * sc;
if (sc >= sBatch && tc >= wBatch)
break;
}
wCount = 0;
nextSeq = seq + sc;
if (sc <= 0)
return 0;
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vSize;
InitTensor2D(batchEnc, sc, max, X_INT, devID);
InitTensor2D(label, sc, max, X_INT, devID);
InitTensor(gold, 3, dims, X_FLOAT, devID);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID);
batchEnc->SetZeroAll();
label->SetZeroAll();
gold->SetZeroAll();
paddingEnc->SetZeroAll();
paddingDec->SetZeroAll();
int seqSize = 0;
int* batchEncValues = new int[batchEnc->unitNum];
int* labelValues = new int[label->unitNum];
MTYPE* goldOffsets = new MTYPE[gold->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE* paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0;
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
for (int s = seq; s < seq + sc; s++) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
if (w > 0) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
}
else {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
}
}
wCount++;
if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if (seqs != NULL) {
for (int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
label->SetData(labelValues, label->unitNum);
gold->SetDataBatched(goldOffsets, 1.0F, wGold);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
/*XTensor * tmp = NewTensorBuf(paddingEnc, devID);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);*/
delete[] batchEncValues;
delete[] labelValues;
delete[] goldOffsets;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
return sc;
}
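In short, labelValues holds the input shifted left by one position (next-word prediction): the label at position w - 1 is the word at position w, and at the last position it is either the doubled </s> (when isDoubledEnd is set) or the following word in the buffer, i.e. the </s> that len excluded.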
int CompareBatchNode(const void* a, const void* b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSizeEnc - size of the encoder vocabulary
>> vSizeDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining)
{
if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2);
int seq = 0;
bufBatchSize = 0;
nextBatch = 0;
/* we segment the buffer into batches */
while (seq < nseqBuf) {
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
while (seq + sc < nseqBuf) {
/* source-side sequence */
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;
if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
break;
wcEnc += wnEnc;
sc += 1;
if (maxEnc < wnEnc)
maxEnc = wnEnc;
wcDec += wnDec;
sc += 1;
if (maxDec < wnDec)
maxDec = wnDec;
}
BatchNode& batch = bufBatch[bufBatchSize];
batch.beg = seq;
batch.end = seq + sc;
batch.maxEnc = maxEnc;
batch.maxDec = maxDec;
batch.key = rand();
bufBatchSize++;
seq = seq + sc;
}
if (isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
}
if (bufBatchSize <= 0)
return 0;
BatchNode& batch = bufBatch[nextBatch++];
int seq = batch.beg;
int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc;
int maxDec = batch.maxDec;
CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc / 2;
int seqSize = 0;
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2D(label, sCount, maxDec, X_INT, devID);
//InitTensor(gold, 3, dimsDec, X_FLOAT, devID);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
batchDec->SetZeroAll();
paddingDec->SetZeroAll();
label->SetZeroAll();
//gold->SetZeroAll();
int wCountEnc = 0;
int wCountDec = 0;
int wCountPad = 0;
wCount = 0;
int* batchEncValues = new int[batchEnc->unitNum];
int* batchDecValues = new int[batchDec->unitNum];
int* labelValues = new int[label->unitNum];
MTYPE* paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE* paddingDecOffsets = new MTYPE[sc * maxDec / 2];
//MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum);
/* batch of the source-side sequences */
for (int s = seq; s < seq + sc; s += 2) {
int len = seqLen[s];
int sent = (s - seq) / 2;
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
wCountEnc++;
}
}
ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
//XTensor * tmp = NewTensorBuf(paddingEnc, devID);
//_ConvertDataType(batchEnc, tmp);
//tmp->Dump(stderr, "tmp:");
//_NotEqual(tmp, paddingEnc, 0);
//DelTensorBuf(tmp);
/* batch of the target-side sequences */
for (int s = seq + 1; s < seq + sc; s += 2) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1) / 2;
for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len - 1) {
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++;
}
if (w > 0) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
}
if (w == len - 1) {
if (isDoubledEnd) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
}
else {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
}
}
//wCount++;
wCountDec++;
if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if (seqs != NULL) {
for (int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
batchDec->SetData(batchDecValues, batchDec->unitNum);
label->SetData(labelValues, label->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
//gold->SetDataBatched(goldOffsets, 1.0F, wGold);
delete[] batchEncValues;
delete[] batchDecValues;
delete[] labelValues;
delete[] paddingEncOffsets;
delete[] paddingDecOffsets;
//delete[] goldOffsets;
return sc;
}
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TBatchLoader::Shuffle(const char* srcFile, const char* tgtFile)
{
char* line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
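/* A portable alternative (an illustrative sketch, not part of the original
   code): the same shuffling can be done in memory with the standard library,
   which also works on Windows. It assumes <algorithm>, <fstream>, <random>,
   <string> and <vector> are included at the top of the file. */
static void ShuffleInMemory(const char* srcFile, const char* tgtFile)
{
    /* read all lines of the source file */
    std::ifstream in(srcFile);
    std::vector<std::string> lines;
    std::string line;
    while (std::getline(in, line))
        lines.push_back(line);

    /* shuffle them with a randomly seeded Mersenne Twister */
    std::mt19937 rng(std::random_device{}());
    std::shuffle(lines.begin(), lines.end(), rng);

    /* write the result */
    std::ofstream out(tgtFile);
    for (const std::string& l : lines)
        out << l << '\n';
}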
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH (1024 * 4)
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int* buf;
/* another buffer */
int* buf2;
/* batch buf */
BatchNode* bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int* seqLen;
/* another array */
int* seqLen2;
/* offset of the first word for each sequence */
int* seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(T2TConfig& config);
/* load data to buffer */
int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the output by key in a range (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 4 bytes: source vocabulary size
next 4 bytes: target vocabulary size
next 8 bytes: number of sentence pairs
subsequent segments:
source sentence length (4 bytes)
target sentence length (4 bytes)
source tokens (4 bytes per token)
target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
buffer.Clear();
curIdx = 0;
int id = 0;
uint64_t sentNum = 0;
int srcVocabSize = 0;
int tgtVocabSize = 0;
fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
fread(&sentNum, sizeof(uint64_t), 1, fp);
CheckNTErrors(sentNum > 0, "Invalid number of sentence pairs");
while (id < sentNum) {
int srcLen = 0;
int tgtLen = 0;
fread(&srcLen, sizeof(int), 1, fp);
fread(&tgtLen, sizeof(int), 1, fp);
CheckNTErrors(srcLen > 0, "Invalid source sentence length");
CheckNTErrors(tgtLen > 0, "Invalid target sentence length");
IntList srcSent;
IntList tgtSent;
srcSent.ReadFromFile(fp, srcLen);
tgtSent.ReadFromFile(fp, tgtLen);
TrainExample* example = new TrainExample;
example->id = id++;
example->key = id;
example->srcSent = srcSent;
example->tgtSent = tgtSent;
buffer.Add(example);
}
fclose(fp);
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
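/* For reference, a minimal sketch (not part of the original code) of a
   writer that produces the binary layout parsed above; the function name,
   token ids and lengths are hypothetical, and <cstdio>/<cstdint> are assumed
   to be available. */
static void WriteToyTrainingFile(const char* path)
{
    FILE* out = fopen(path, "wb");
    int srcVocabSize = 100;
    int tgtVocabSize = 100;
    uint64_t sentNum = 1;
    fwrite(&srcVocabSize, sizeof(int), 1, out);
    fwrite(&tgtVocabSize, sizeof(int), 1, out);
    fwrite(&sentNum, sizeof(uint64_t), 1, out);

    /* one sentence pair: both lengths first, then the token ids */
    int src[] = { 5, 8, 2 };
    int tgt[] = { 6, 9, 7, 2 };
    int srcLen = 3;
    int tgtLen = 4;
    fwrite(&srcLen, sizeof(int), 1, out);
    fwrite(&tgtLen, sizeof(int), 1, out);
    fwrite(src, sizeof(int), srcLen, out);
    fwrite(tgt, sizeof(int), tgtLen, out);
    fclose(out);
}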
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the label of input
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - the number of target tokens and the number of sentences in the batch
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID)
{
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
/* dynamic batching for sentences, enabled when the dataset is used for training */
if (isTraining) {
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
realBatchSize++;
}
}
/* real batch size */
if ((buffer.Size() - curIdx) < realBatchSize) {
realBatchSize = buffer.Size() - curIdx;
}
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
int* batchEncValues = new int[realBatchSize * maxSrcLen];
float* paddingEncValues = new float[realBatchSize * maxSrcLen];
int* labelValues = new int[realBatchSize * maxTgtLen];
int* batchDecValues = new int[realBatchSize * maxTgtLen];
float* paddingDecValues = new float[realBatchSize * maxTgtLen];
for (int i = 0; i < realBatchSize * maxSrcLen; i++) {
batchEncValues[i] = PAD;
paddingEncValues[i] = 1;
}
for (int i = 0; i < realBatchSize * maxTgtLen; i++) {
batchDecValues[i] = PAD;
labelValues[i] = PAD;
paddingDecValues[i] = 1.0F;
}
size_t curSrc = 0;
size_t curTgt = 0;
/*
batchEnc: end with EOS (right padding)
batchDec: begin with SOS (right padding)
label: end with EOS (right padding)
*/
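/* A worked example (the token ids 17, 42, 23, 56 are made up): with
   maxSrcLen = 4 and maxTgtLen = 4, a stored pair src = [17 42 EOS],
   tgt = [SOS 23 56] is laid out as
       batchEnc = [17 42 EOS PAD]    paddingEnc = [1 1 1 0]
       batchDec = [SOS 23 56 PAD]    paddingDec = [1 1 1 0]
       label    = [23 56 EOS PAD]
   i.e. the label row is the decoder input shifted left by one position and
   closed with EOS. */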
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
if (j > 0)
labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
}
labelValues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
paddingEncValues[curSrc++] = 0;
while (curTgt < maxTgtLen * (i + 1))
paddingDecValues[curTgt++] = 0;
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
curIdx += realBatchSize;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] paddingEncValues;
delete[] batchDecValues;
delete[] paddingDecValues;
delete[] labelValues;
info.Add(tgtTokenNum);
info.Add(realBatchSize);
return info;
}
/*
the constructor of DataSet
>> dataFile - path of the data file
>> bucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
fp = fopen(dataFile, "rb");
CheckNTErrors(fp, "cannot open the training file");
curIdx = 0;
bucketSize = myBucketSize;
isTraining = training;
LoadDataToBuffer();
SortByLength();
if (isTraining)
BuildBucket();
}
/* check if the buffer is empty */
bool TrainDataSet::IsEmpty() {
if (curIdx < buffer.Size())
return false;
return true;
}
/* reset the buffer */
void TrainDataSet::ClearBuf()
{
curIdx = 0;
/* make different batches in different epochs */
SortByLength();
if (isTraining)
BuildBucket();
}
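/* Typical usage (an illustrative sketch, not part of the original code; the
   file name and sizes are hypothetical):

       TrainDataSet dataSet;
       dataSet.Init("train.bin", 4096, true);
       XTensor batchEnc, paddingEnc, batchDec, paddingDec, label;
       while (!dataSet.IsEmpty()) {
           UInt64List info = dataSet.LoadBatch(&batchEnc, &paddingEnc,
                                               &batchDec, &paddingDec, &label,
                                               1, 4096, -1);
           // info[0] = number of target tokens, info[1] = number of sentences
       }
       dataSet.ClearBuf();  // re-sort and re-bucket for the next epoch
*/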
/* group data into buckets with similar length */
void TrainDataSet::BuildBucket()
{
size_t idx = 0;
/* build and shuffle buckets */
while (idx < buffer.Size()) {
/* sentence number in a bucket */
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[idx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
/* make sure the number is valid */
if ((buffer.Size() - idx) < sentNum) {
sentNum = buffer.Size() - idx;
}
int randomKey = rand();
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
}
idx += sentNum;
}
SortBucket();
/* sort items in a bucket */
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
idx += sentNum;
}
}
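/* A worked example (illustrative lengths): after SortByLength(), source
   lengths [9 7 6 5 3 2] with bucketSize = 8 are grouped so that each bucket
   holds roughly bucketSize source tokens: {9}, {7 6}, {5 3}, {2}. Every
   bucket then gets one random key, so SortBucket() shuffles the buckets as
   units, and SortInBucket() re-orders the sentences inside each bucket by
   length. */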
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
/* release the buffer */
for (int i = 0; i < buffer.Size(); i++)
delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a class of sentence pairs for training */
struct TrainExample {
/* id of the sentence pair */
int id;
/* source language sentence (tokenized) */
IntList srcSent;
/* target language sentence (tokenized) */
IntList tgtSent;
/* the key used to shuffle items in a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
public:
/* the data buffer */
TrainBufferType buffer;
/* a list of empty line numbers */
IntList emptyLines;
/* the pointer to file stream */
FILE* fp;
/* current index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences */
size_t bucketSize;
/* indicates whether it is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort the output by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is empty */
bool IsEmpty();
/* reset the buffer */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,30 +18,31 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */
-#include <cmath>
-#include "T2TTrainer.h"
-#include "../module/T2TUtility.h"
+#include "Trainer.h"
+#include "../Utility.h"
+#include "../../../network/XNoder.h"
 #include "../../../tensor/XUtility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/loss/LHeader.h"
-#include "../../../network/XNoder.h"
 #ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
 #endif
-#include "../../../tensor/XMem.h"
-namespace transformer
+namespace nmt
 {
 /* constructor */
-T2TTrainer::T2TTrainer()
+Trainer::Trainer()
 {
 cfg = NULL;
 }
 /* de-constructor */
-T2TTrainer::~T2TTrainer()
+Trainer::~Trainer()
 {
 for (int i = 0; i < moments.count; i++) {
 XTensor* m = (XTensor*)moments.Get(i);
@@ -59,15 +59,17 @@ T2TTrainer::~T2TTrainer()
 initialization
 >> config - configurations of the training process
 */
-void T2TTrainer::Init(T2TConfig& config)
+void Trainer::Init(Config& config)
 {
 cfg = &config;
 lrate = config.lrate;
 lrbias = config.lrbias;
 sBatchSize = config.sBatchSize;
 wBatchSize = config.wBatchSize;
+bucketSize = config.bucketSize;
 nepoch = config.nepoch;
 nstep = config.nstep;
+maxCheckpoint = config.maxCheckpoint;
 d = config.modelSize;
 nwarmup = config.nwarmup;
 vSize = config.srcVocabSize;
@@ -81,17 +83,12 @@ void T2TTrainer::Init(T2TConfig& config)
 nStepCheckpoint = config.nStepCheckpoint;
 useEpochCheckpoint = config.useEpochCheckpoint;
 updateStep = config.updateStep;
-isDebugged = config.isDebugged;
 isLenSorted = config.isLenSorted;
 adamBeta1T = 1.0F;
 adamBeta2T = 1.0F;
-batchLoader.Init(config);
 }
-int tc = 0;
 /*
 train the model
 >> fn - training data file
@@ -99,8 +96,14 @@ train the model
 >> modelFN - where we keep the model
 >> model - model to train
 */
-void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model)
+void Trainer::Train(const char* fn, const char* validFN,
+const char* modelFN, Model* model)
 {
+/* disable cache during training */
+for (int i = 0; i < model->decoder->nlayer; i++) {
+model->decoder->selfAttCache[i].enable = false;
+model->decoder->enDeAttCache[i].enable = false;
+}
 int step = 0;
 int wc = 0;
 int ws = 0;
@@ -126,26 +129,26 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 #endif
 int devID = model->devID;
-XNet net;
 PrepareModel(model);
 double startT = GetClockSec();
-for (epoch = 1; epoch <= nepoch; epoch++) {
-#ifndef WIN32
-if (isShuffled) {
-fprintf(stderr, "shuffle the file\n");
-batchLoader.Shuffle(fn, trainFN);
-}
-#endif
-FILE* file = fopen(trainFN, "r");
-CheckNTErrors(file, "cannot open training file!");
+batchLoader.Init(fn, bucketSize, true);
+for (epoch = 1; epoch <= nepoch; epoch++) {
 wordCount = 0;
 loss = 0;
+/* reset the batch loader */
+batchLoader.ClearBuf();
+while (!batchLoader.IsEmpty())
+{
+XNet net;
+net.Clear();
 /* batch of sequences (on the encoder and decoder sides) */
 XTensor batchEnc;
 XTensor batchDec;
@@ -157,14 +160,11 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 XTensor paddingEnc;
 XTensor paddingDec;
-/* gold standard */
-XTensor gold;
-while (batchLoader.LoadBatch(file, model->isLM,
-&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
-NULL, vSize, vSizeTgt,
-sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
-{
+UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
+sBatchSize, wBatchSize, devID);
+wc = info[0];
+ws = info[1];
 CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
 /* output probabilities */
@@ -204,10 +204,18 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 /* update the parameters */
 if (gradStep == updateStep) {
-/* learning rate */
-lr = lrate * (1.0F / (float)sqrt((float)d)) *
-(float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
-((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
+float warmupEndLR = lrate;
+float warmupInitLR = 1e-7;
+float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
+float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
+/* learning rate, scheduled by inverse square root */
+if (step < nwarmup)
+lr = warmupInitLR + step * lrStep;
+else
+lr = decayFactor * pow((float)step, -0.5F);
 /* model update */
 Update(model, lr);
@@ -224,15 +232,21 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 break;
 }
+if (step == 10) {
+// LOG("after backward --------");
+// lossTensor.mem->ShowMemUsage(stderr);
+// exit(0);
+}
 if (step % 100 == 0) {
 double elapsed = GetClockSec() - startT;
-XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-elapsed, step, epoch,
-wordCountTotal, batchCountTotal,
-loss / wordCount, exp(loss / wordCount), exp(lossBatch / wc));
+LOG("elapsed=%.1fs, step=%d, epoch=%d, "
+"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
+elapsed, step, epoch, wordCountTotal, batchCountTotal,
+loss / wordCount / log(2.0), exp(loss / wordCount), lr);
 if (!doUpdate)
 XPRINT(0, stderr, " (no update)");
-XPRINT(0, stderr, "\n");
 }
 if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
@@ -242,8 +256,6 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 }
 }
-fclose(file);
 if (isEnd)
 break;
@@ -255,10 +267,14 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
 epoch = MIN(epoch, nepoch);
-XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
-lr, elapsed, step, epoch, wordCountTotal, loss / wordCount, exp(loss / wordCount));
-XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
-elapsed, step, nSkipped, epoch);
+LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
+"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
+lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
+LOG("training finished (took %.1fs, step=%d, "
+"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
+LOG("saving the final model");
+model->Dump(modelFN);
 delete[] trainFN;
 }
@@ -269,7 +285,7 @@ test the model
 >> ofn - output data file
 >> model - model that is trained
 */
-void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
+void Trainer::Validate(const char* fn, const char* ofn, Model* model)
 {
 int wc = 0;
 int ws = 0;
@@ -278,13 +294,12 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 float loss = 0;
 /* data files */
-FILE* file = fopen(fn, "rb");
-CheckNTErrors(file, "Cannot read the test file");
-FILE* ofile = fopen(ofn, "wb");
-CheckNTErrors(ofile, "Cannot open the output file");
+batchLoader.Init(fn, 0, false);
 double startT = GetClockSec();
+while (!batchLoader.IsEmpty())
+{
 /* batch of input sequences */
 XTensor batchEnc;
 XTensor batchDec;
@@ -296,24 +311,19 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 XTensor paddingEnc;
 XTensor paddingDec;
-/* gold standard */
-XTensor gold;
-/* an array that keeps the sequences */
-int* seqs = new int[MILLION];
-batchLoader.ClearBuf();
-while (batchLoader.LoadBatch(file, model->isLM,
-&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
-seqs, vSize, vSizeTgt,
-1, 1, false, ws, wc, model->devID, false))
-{
-CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
 /* output probabilities */
 XTensor output;
+/* prediction probabilities */
+XTensor labelOnehot;
+XTensor lossTensor;
+UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
+sBatchSize, 0, model->devID);
+wc = info[0];
+ws = info[1];
+CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
 /* make the network */
 if (model->isLM)
 model->MakeLM(batchEnc, output, paddingEnc, false);
@@ -326,52 +336,20 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
 int bSize = output.GetDim(0);
 int length = output.GetDim(1);
-/* prediction probabilities */
-XTensor labelOnehot;
-XTensor lossTensor;
 labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
 lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
 float lossBatch = ReduceSumAllValue(lossTensor);
-/* dump the test result */
-for (int s = 0; s < bSize; s++) {
-DTYPE sum = 0;
-int* seq = seqs + s * length;
-for (int i = 0; i < length; i++) {
-if (seq[i] >= 0) {
-fprintf(ofile, "%d ", seq[i]);
-}
-else
-break;
-}
-fprintf(ofile, "||| ");
-for (int i = 0; i < length; i++) {
-if (seq[i] >= 0) {
-DTYPE p = lossTensor.Get2D(s, i);
-fprintf(ofile, "%.3e ", p);
-sum += p;
-}
-else
-break;
-}
-fprintf(ofile, "||| %e\n", sum);
-}
 loss += lossBatch;
 wordCount += wc;
 sentCount += bSize;
 }
-fclose(file);
-fclose(ofile);
-delete[] seqs;
 double elapsed = GetClockSec() - startT;
-XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
-elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
+LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
+elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
 }
 /*
@@ -382,20 +360,29 @@ make a checkpoint
 >> label - label of the model
 >> id - id of the checkpoint
 */
-void T2TTrainer::MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id)
+void Trainer::MakeCheckpoint(Model* model, const char* validFN,
+const char* modelFN, const char* label, int id)
 {
-fprintf(stderr, "make a checkpoint\n");
+LOG("make a checkpoint");
 char* fn = new char[MAX_LINE_LENGTH];
+Trainer validator;
+validator.Init(*cfg);
+/* save last checkpoints */
+id = validator.maxCheckpoint - (maxCheckpoint--);
+if (maxCheckpoint == 0)
+maxCheckpoint = validator.maxCheckpoint;
 sprintf(fn, "%s.%s.%03d", modelFN, label, id);
 model->Dump(fn);
 delete[] fn;
 char* fn2 = new char[MAX_LINE_LENGTH];
 sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
 if (validFN != NULL) {
-T2TTrainer trainer;
-trainer.Init(*cfg);
-trainer.Validate(validFN, fn2, model);
+validator.Validate(validFN, fn2, model);
 }
 delete[] fn2;
 }
@@ -405,12 +392,12 @@ update the model by delta rule
 \theta_{new} = \theta - \lrate * grad
 where
 \lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
->> model - the t2t model
+>> model - the model
 >> lr - learning rate
 */
-void T2TTrainer::Update(T2TModel* model, const float lr)
+void Trainer::Update(Model* model, const float lr)
 {
-TensorList ws(100);
+TensorList ws;
 model->GetParams(ws);
@@ -465,12 +452,12 @@ void T2TTrainer::Update(T2TModel* model, const float lr)
 prepare model for training
 >> model - the model for training
 */
-void T2TTrainer::PrepareModel(T2TModel* model)
+void Trainer::PrepareModel(Model* model)
 {
 moments.Clear();
 moments2nd.Clear();
-TensorList ws(100);
+TensorList ws;
 model->GetParams(ws);
......
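The learning rate schedule introduced in Trainer::Train above warms up linearly from warmupInitLR to lrate over nwarmup steps and then decays with the inverse square root of the step number. A standalone sketch of the same arithmetic (the peak rate and warmup length are illustrative assumptions, not values from this commit):

#include <cmath>
#include <cstdio>

int main()
{
    const float lrate = 0.002F;       /* assumed peak learning rate */
    const int nwarmup = 4000;         /* assumed warmup steps */
    const float warmupInitLR = 1e-7F;
    const float lrStep = (lrate - warmupInitLR) / nwarmup;
    const float decayFactor = lrate * std::pow((float)nwarmup, 0.5F);
    const int steps[] = { 1000, 4000, 16000, 64000 };
    for (int step : steps) {
        float lr = (step < nwarmup) ? warmupInitLR + step * lrStep
                                    : decayFactor * std::pow((float)step, -0.5F);
        /* prints roughly 0.0005, 0.002, 0.001 and 0.0005 respectively */
        std::printf("step=%6d  lr=%.6f\n", step, lr);
    }
    return 0;
}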
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,25 +18,24 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */
-#ifndef __T2TTRAINER_H__
-#define __T2TTRAINER_H__
+#ifndef __TRAINER_H__
+#define __TRAINER_H__
-#include "../T2TModel.h"
-#include "T2TBatchLoader.h"
-#include "../../../tensor/function/FHeader.h"
+#include "../Model.h"
+#include "TrainDataSet.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
-/* trainer of the T2T model */
-class T2TTrainer
+/* trainer of the model */
+class Trainer
 {
 public:
 /* configurations */
-T2TConfig* cfg;
+Config* cfg;
 /* dimension size of each inner layer */
 int d;
@@ -63,12 +61,18 @@ public:
 /* word batch size */
 int wBatchSize;
+/* size of bucket for grouping data by length */
+int bucketSize;
 /* training epoch number */
 int nepoch;
 /* training step number */
 int nstep;
+/* the maximum number of saved checkpoints */
+int maxCheckpoint;
 /* indicates whether we use adam */
 bool useAdam;
@@ -100,39 +104,36 @@ public:
 /* number of batches on which we do model update */
 int updateStep;
-/* indicates whether we intend to debug the net */
-bool isDebugged;
 /* indicates whether the sequence is sorted by length */
 bool isLenSorted;
-/* for batching */
-T2TBatchLoader batchLoader;
+/* used for loading batches */
+TrainDataSet batchLoader;
 public:
 /* constructor */
-T2TTrainer();
+Trainer();
 /* de-constructor */
-~T2TTrainer();
+~Trainer();
 /* initialize the trainer */
-void Init(T2TConfig& config);
+void Init(Config& config);
 /* train the model */
-void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
+void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
 /* test the model */
-void Validate(const char* fn, const char* ofn, T2TModel* model);
+void Validate(const char* fn, const char* ofn, Model* model);
 /* make a checkpoint */
-void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
+void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
 /* update the model by delta rule */
-void Update(T2TModel* model, const float lr);
+void Update(Model* model, const float lr);
 /* prepare model for training */
-void PrepareModel(T2TModel* model);
+void PrepareModel(Model* model);
 };
 }
......
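The maxCheckpoint counter added above is used in Trainer::MakeCheckpoint to rotate checkpoint ids so that only the newest maxCheckpoint model files are kept. A standalone sketch of the id arithmetic (maxCheckpoint = 3 and the number of calls are assumed values):

#include <cstdio>

int main()
{
    const int limit = 3;   /* plays the role of validator.maxCheckpoint */
    int remaining = limit; /* plays the role of this->maxCheckpoint */
    for (int call = 0; call < 7; call++) {
        int id = limit - (remaining--);
        if (remaining == 0)
            remaining = limit;
        /* the ids cycle 0, 1, 2, 0, 1, 2, 0, so old files are overwritten */
        std::printf("call %d -> checkpoint id %d\n", call, id);
    }
    return 0;
}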
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,23 +25,25 @@
 #include <fstream>
 #include <algorithm>
-#include "T2TDataSet.h"
-#include "../module/T2TUtility.h"
+#include "DataSet.h"
+#include "../Utility.h"
-using namespace transformer;
+using namespace nmt;
 namespace nts {
 /* sort the output by id (in ascending order) */
 void DataSet::SortInput() {
-sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
+sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
+[](Example* a, Example* b) {
 return a->values.count > b->values.count;
 });
 }
 /* sort the input by length (in descending order) */
 void DataSet::SortOutput() {
-sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
+sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
+[](Result* a, Result* b) {
 return a->id < b->id;
 });
 }
@@ -74,7 +75,7 @@ void DataSet::LoadDataToBuffer()
 : line.size() - indices[i];
 string word = line.substr(indices[i], offset);
 if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
-values.Add(3);
+values.Add(UNK);
 else
 values.Add(srcVocab.word2id.at(word));
 }
@@ -100,7 +101,7 @@ void DataSet::LoadDataToBuffer()
 }
 /*
-load a mini-batch to the device
+load a mini-batch to the device (for translating)
 >> batchEnc - a tensor to store the batch of input
 >> paddingEnc - a tensor to store the batch of paddings
 >> minSentBatch - the minimum number of sentence batch
@@ -117,10 +118,10 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 size_t maxLen = inputBuffer[bufferUsed]->values.Size();
 /* dynamic batching for sentences */
-while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
-&& (realBatchSize * maxLen < batchSize)) {
-realBatchSize++;
-}
+//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
+//    && (realBatchSize * maxLen < batchSize)) {
+//    realBatchSize++;
+//}
 /* real batch size */
 if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
@@ -133,13 +134,13 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 float* paddingValues = new float[realBatchSize * maxLen];
 for (int i = 0; i < realBatchSize * maxLen; i++) {
-batchValues[i] = 1;
-paddingValues[i] = 0.0F;
+batchValues[i] = PAD;
+paddingValues[i] = 1.0F;
 }
-size_t cur = 0;
+size_t curSrc = 0;
-/* left padding */
+/* right padding */
 UInt64List infos;
 size_t totalLength = 0;
@@ -147,11 +148,11 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
 infos.Add(inputBuffer[bufferUsed + i]->id);
 totalLength += inputBuffer[bufferUsed + i]->values.Size();
-cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
-for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
-batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
-paddingValues[cur++] = 1.0F;
-}
+curSrc = maxLen * i;
+for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
+batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
+while (curSrc < maxLen * (i + 1))
+paddingValues[curSrc++] = 0;
 }
 infos.Add(totalLength);
@@ -178,7 +179,7 @@ the constructor of DataSet
 void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
 {
 fp = new ifstream(dataFile);
-CheckNTErrors(fp->is_open(), "can not open the file");
+CheckNTErrors(fp->is_open(), "Can not open the test data");
 bufferUsed = 0;
 CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,7 +25,7 @@
 #include <cstdio>
 #include <vector>
 #include <fstream>
-#include "T2TVocab.h"
+#include "Vocab.h"
 #include "../../../tensor/XList.h"
 #include "../../../tensor/XTensor.h"
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,11 +21,11 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */
-#include "T2TLengthPenalty.h"
+#include "LengthPenalty.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /*
@@ -36,7 +35,7 @@ where n = length of the sequence
 >> alpha - the parameter controls the length preference
 << return - length penalty of the sequence
 */
-float T2TLengthPenalizer::GNMT(float length, float alpha)
+float LengthPenalizer::GNMT(float length, float alpha)
 {
 float base;
 float lp;
......
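A worked example of the GNMT length penalty lp = ((5 + n) / (5 + 1))^alpha computed above (a standalone sketch; the lengths and alpha are illustrative assumptions):

#include <cmath>
#include <cstdio>

int main()
{
    const float alpha = 0.6F;
    const float lengths[] = { 5.0F, 10.0F, 20.0F };
    for (float n : lengths) {
        /* lp grows with n: roughly 1.36, 1.73 and 2.35 here */
        float lp = std::pow((5.0F + n) / 6.0F, alpha);
        /* hypothesis scores are divided by lp, so longer outputs are
           penalized less and can compete with short ones */
        std::printf("n=%4.0f  lp=%.3f\n", n, lp);
    }
    return 0;
}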
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,21 +21,21 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */
-#ifndef __T2TLENGTHPENALTY_H__
-#define __T2TLENGTHPENALTY_H__
+#ifndef __LENGTHPENALTY_H__
+#define __LENGTHPENALTY_H__
-#include "../module/T2TUtility.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /* We intend to penalize short sequences because they have higher score
 in product of a sequence of probability-like terms and have more chances
 to beat others in search. */
-class T2TLengthPenalizer
+class LengthPenalizer
 {
 public:
 /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
......
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
-* All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+* Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,23 +21,23 @@
 #include <iostream>
-#include "T2TPredictor.h"
-#include "../module/T2TNNUtil.h"
+#include "Predictor.h"
+#include "../module/NNUtil.h"
 using namespace nts;
-namespace transformer
+namespace nmt
 {
 /* constructor */
-T2TStateBundle::T2TStateBundle()
+StateBundle::StateBundle()
 {
 states = NULL;
 isStart = false;
 }
 /* de-constructor */
-T2TStateBundle::~T2TStateBundle()
+StateBundle::~StateBundle()
 {
 if (states != NULL)
 delete[] states;
@@ -48,18 +47,18 @@ T2TStateBundle::~T2TStateBundle()
 create states
 >> num - number of states
 */
-void T2TStateBundle::MakeStates(int num)
+void StateBundle::MakeStates(int num)
 {
 CheckNTErrors(num > 0, "invalid number");
 if (states != NULL)
 delete[] states;
-states = new T2TState[num];
+states = new State[num];
 for (int i = 0; i < num; i++) {
 states[i].prediction = -1;
-states[i].pid = T2T_PID_EMPTY;
+states[i].pid = _PID_EMPTY;
 states[i].isEnd = false;
 states[i].isStart = false;
 states[i].isCompleted = false;
@@ -74,26 +73,26 @@ void T2TStateBundle::MakeStates(int num)
 }
 /* constructor */
-T2TPredictor::T2TPredictor()
+Predictor::Predictor()
 {
 startSymbol = 2;
 }
 /* de-constructor */
-T2TPredictor::~T2TPredictor()
+Predictor::~Predictor()
 {
 }
 /*
 create an initial state
->> model - the t2t model
+>> model - the model
 >> top - the top-most layer of the network
 >> input - input of the network
 >> beamSize - beam size
 >> state - the state to be initialized
 */
-void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
-int beamSize, T2TStateBundle* state)
+void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
+int beamSize, StateBundle* state)
 {
 int dims[MAX_TENSOR_DIM_NUM];
 for (int i = 0; i < input->order - 1; i++)
@@ -114,20 +113,20 @@ void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
 set start symbol
 >> symbol - the symbol (in integer)
 */
-void T2TPredictor::SetStartSymbol(int symbol)
+void Predictor::SetStartSymbol(int symbol)
 {
 startSymbol = symbol;
 }
 /*
 read a state
->> model - the t2t model that keeps the network created so far
+>> model - the model that keeps the network created so far
 >> state - a set of states. It keeps
 1) hypotheses (states)
 2) probabilities of hypotheses
 3) parts of the network for expanding toward the next state
 */
-void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
+void Predictor::Read(Model* model, StateBundle* state)
 {
 m = model;
 s = state;
@@ -147,7 +146,7 @@ predict the next state
 >> needReorder - whether we need reordering the states
 >> nstep - current time step of the target sequence
 */
-void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
+void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
 XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
 XTensor& reorderState, bool needReorder, int nstep)
 {
@@ -221,14 +220,14 @@ void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& e
 generate paths up to the states of the current step
 >> state - state bundle of the current step
 */
-XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
+XTensor Predictor::GeneratePaths(StateBundle* state)
 {
 CheckNTErrors(state->stateNum >= 0, "Illegal state!");
 int distance = -1;
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 int nsteps = 0;
 while (cur != NULL) {
@@ -245,7 +244,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
 path.SetZeroAll();
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 int nsteps = 0;
 while (cur != NULL) {
@@ -263,21 +262,21 @@ get the predictions of the previous step
 >> state - state bundle of the current step
 >> devID - the device id for the predictions
 */
-XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
+XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
 {
 CheckNTErrors(state->stateNum >= 0, "Illegal state!");
 IntList last;
 for (int i = 0; i < state->stateNum; i++) {
-T2TState* cur = state->states + i;
+State* cur = state->states + i;
 last.Add(cur->prediction);
 }
 XTensor lastPred;
-InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
-lastPred.SetData(last.items, last.Size());
+InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
+lastPred.SetData(last.items, int(last.Size()));
 return lastPred;
 }
......
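GeneratePaths and GetLastPrediction above both walk the `last` back-pointers of State objects to recover a hypothesis. A minimal standalone sketch of the same back-pointer idea (the struct and values are hypothetical):

#include <cstdio>

struct Node {
    int prediction; /* predicted token id */
    Node* last;     /* pointer to the previous state */
};

int main()
{
    /* a 3-step chain: 2 (start symbol) -> 7 -> 9 */
    Node a{ 2, nullptr };
    Node b{ 7, &a };
    Node c{ 9, &b };
    /* walk backwards from the newest state; prints "9 7 2" */
    for (Node* cur = &c; cur != nullptr; cur = cur->last)
        std::printf("%d ", cur->prediction);
    std::printf("\n");
    return 0;
}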
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,22 +20,22 @@ ...@@ -21,22 +20,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TPREDICTOR_H__ #ifndef __PREDICTOR_H__
#define __T2TPREDICTOR_H__ #define __PREDICTOR_H__
#include "../T2TModel.h" #include "../Model.h"
#include "T2TLengthPenalty.h" #include "LengthPenalty.h"
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
#define T2T_PID_EMPTY -1 #define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution, /* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypotheses in translation. */ and etc. It can be regarded as a hypotheses in translation. */
class T2TState class State
{ {
public: public:
/* we assume that the prediction is an integer */ /* we assume that the prediction is an integer */
...@@ -69,11 +68,11 @@ public: ...@@ -69,11 +68,11 @@ public:
int nstep; int nstep;
/* pointer to the previous state */ /* pointer to the previous state */
T2TState* last; State* last;
}; };
/* a bundle of states */ /* a bundle of states */
class T2TStateBundle class StateBundle
{ {
public: public:
/* predictions */ /* predictions */
...@@ -98,7 +97,7 @@ public: ...@@ -98,7 +97,7 @@ public:
float nstep; float nstep;
/* list of states */ /* list of states */
T2TState* states; State* states;
/* number of states */ /* number of states */
int stateNum; int stateNum;
...@@ -108,10 +107,10 @@ public: ...@@ -108,10 +107,10 @@ public:
public: public:
/* constructor */ /* constructor */
T2TStateBundle(); StateBundle();
/* de-constructor */ /* de-constructor */
~T2TStateBundle(); ~StateBundle();
/* create states */ /* create states */
void MakeStates(int num); void MakeStates(int num);
...@@ -122,14 +121,14 @@ public: ...@@ -122,14 +121,14 @@ public:
we get the state of previous words and then generate the next word. we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */ indices, hidden states, embeddings and etc.). */
class T2TPredictor class Predictor
{ {
private: private:
/* pointer to the transformer model */ /* pointer to the transformer model */
T2TModel* m; Model* m;
/* current state */ /* current state */
T2TStateBundle* s; StateBundle* s;
/* start symbol */ /* start symbol */
int startSymbol; int startSymbol;
...@@ -139,30 +138,30 @@ private: ...@@ -139,30 +138,30 @@ private:
public: public:
/* constructor */ /* constructor */
T2TPredictor(); Predictor();
/* de-constructor */ /* de-constructor */
~T2TPredictor(); ~Predictor();
/* create an initial state */ /* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state); void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */ /* set the start symbol */
void SetStartSymbol(int symbol); void SetStartSymbol(int symbol);
/* read a state */ /* read a state */
void Read(T2TModel* model, T2TStateBundle* state); void Read(Model* model, StateBundle* state);
/* predict the next state */ /* predict the next state */
void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding, void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize, XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep); bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */ /* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state); XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */ /* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state, int devID); XTensor GetLastPrediction(StateBundle* state, int devID);
}; };
} }
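The back-pointer chain kept in State is what GeneratePaths walks at decoding time; a hypothetical sketch (not the repo's code) of recovering one hypothesis from a final state:

#include <vector>
#include <algorithm>

/* a hypothetical sketch: follow the `last` chain from a final state and
   reverse the collected predictions to recover the token sequence;
   State::prediction is assumed to be the predicted token id, as the
   class comment above states */
std::vector<int> TracePath(const State* end)
{
    std::vector<int> tokens;
    for (const State* s = end; s != NULL; s = s->last)
        tokens.push_back(s->prediction);
    std::reverse(tokens.begin(), tokens.end());
    return tokens;
}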
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,13 +19,13 @@ ...@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include "T2TSearch.h" #include "Search.h"
#include "../module/T2TUtility.h" #include "../Utility.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
BeamSearch::BeamSearch() BeamSearch::BeamSearch()
...@@ -55,7 +54,7 @@ initialize the model ...@@ -55,7 +54,7 @@ initialize the model
>> config - the configuration of the system >> config - the configuration of the system
*/ */
void BeamSearch::Init(T2TConfig& config) void BeamSearch::Init(Config& config)
{ {
beamSize = config.beamSize; beamSize = config.beamSize;
batchSize = config.sBatchSize; batchSize = config.sBatchSize;
...@@ -105,10 +104,10 @@ search for the most promising states ...@@ -105,10 +104,10 @@ search for the most promising states
>> output - output that represents the sequences as rows >> output - output that represents the sequences as rows
>> score - score of the sequences >> score - score of the sequences
*/ */
void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding, void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score) IntList* output, XTensor& score)
{ {
T2TPredictor predictor; Predictor predictor;
XTensor maskEnc; XTensor maskEnc;
XTensor encoding; XTensor encoding;
XTensor encodingBeam; XTensor encodingBeam;
...@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding, ...@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
CheckNTErrors(lengthLimit > 0, "no max length specified!"); CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit; maxLength = lengthLimit;
T2TStateBundle* states = new T2TStateBundle[lengthLimit + 1]; StateBundle* states = new StateBundle[lengthLimit + 1];
T2TStateBundle* first = states; StateBundle* first = states;
T2TStateBundle* cur = NULL; StateBundle* cur = NULL;
T2TStateBundle* next = NULL; StateBundle* next = NULL;
/* create the first state */ /* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first); predictor.Create(model, &encodingBeam, &input, beamSize, first);
...@@ -213,7 +212,7 @@ compute the model score for each hypotheses ...@@ -213,7 +212,7 @@ compute the model score for each hypotheses
>> prev - the beam of the previous state >> prev - the beam of the previous state
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam) void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{ {
XTensor& score = beam->modelScore; XTensor& score = beam->modelScore;
XTensor& prob = beam->prob; XTensor& prob = beam->prob;
...@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam) ...@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
beam->nstep = prev->nstep + 1.0F; beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */ /* the GNMT-like length penalty */
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha); float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */ /* score = log-prob/lp */
score = probPath / lp; score = probPath / lp;
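For reference, the GNMT-style penalty divides the accumulated log-probability by a term that grows with length; a minimal sketch, assuming the ((5 + len) / 6)^alpha form from the GNMT paper (the repo's LengthPenalizer::GNMT may use the same constants):

#include <cmath>

/* a minimal sketch of the GNMT-style length penalty */
float GNMTLengthPenalty(float length, float alpha)
{
    return powf((5.0F + length) / 6.0F, alpha);
}

/* usage: score = logProb / GNMTLengthPenalty(nstep, alpha);
   alpha = 0 gives lp = 1, i.e., no length normalization at all */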
...@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning ...@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning
>> prev - the last beam >> prev - the last beam
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam) void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{ {
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM]; int dimsBeam[MAX_TENSOR_DIM_NUM];
...@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam) ...@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
/* keep the most promising candidates in the beam */ /* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true); TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha); float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU); CopyValues(index, indexCPU);
CopyValues(index, preID); CopyValues(index, preID);
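The flat Top-K indices are later split into a previous-hypothesis slot and a token id; a hypothetical sketch of that arithmetic (vocabSize and flatIndex are illustrative, not taken from the repo):

#include <cstdio>

int main()
{
    int flatIndex = 12345;                  /* one Top-K index (illustrative) */
    int vocabSize = 32000;                  /* assumed vocabulary size */
    int prevBeam  = flatIndex / vocabSize;  /* which hypothesis to extend */
    int wordId    = flatIndex % vocabSize;  /* which token to append */
    printf("beam=%d word=%d\n", prevBeam, wordId);
    return 0;
}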
...@@ -375,14 +374,14 @@ expand the search graph ...@@ -375,14 +374,14 @@ expand the search graph
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
>> reorderState - the new order of states >> reorderState - the new order of states
*/ */
void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState) void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{ {
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!"); "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum); beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states; State* states = beam->states;
XTensor& idRef = beam->preID; XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore; XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob; XTensor& probRef = beam->prob;
...@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo ...@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
for (int i = 0; i < beam->stateNum; i += beamSize) { for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) { for (int j = 0; j < beamSize; j++) {
int k = i + j; int k = i + j;
T2TState& state = states[k]; State& state = states[k];
int offset = id.GetInt(k); int offset = id.GetInt(k);
int pid = i / beamSize; int pid = i / beamSize;
...@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo ...@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
if (offset != j) if (offset != j)
needReorder = true; needReorder = true;
T2TState* last = prev->states + pid * beamSize + offset; State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!"); CheckNTErrors(offset >= 0, "Wrong state index!");
...@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses, ...@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap. we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states >> beam - the beam that keeps a number of states
*/ */
void BeamSearch::Collect(T2TStateBundle* beam) void BeamSearch::Collect(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) { for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i]; State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize, CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!"); "Invalid sample id!");
...@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam) ...@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam)
fill the hypothesis heap with incomplete hypotheses fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final) >> beam - the beam that keeps a number of states (final)
*/ */
void BeamSearch::FillHeap(T2TStateBundle* beam) void BeamSearch::FillHeap(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) { for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) { for (int j = 0; j < beamSize; j++) {
T2TState& state = states[i * beamSize + j]; State& state = states[i * beamSize + j];
/* we push the incomplete hypothesis into the heap */ /* we push the incomplete hypothesis into the heap */
if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) { if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) {
...@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score) ...@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
int c = heap.Count(); int c = heap.Count();
float bestScore = -1e9F; float bestScore = -1e9F;
T2TState* state = NULL; State* state = NULL;
for (int i = 0; i < c; i++) { for (int i = 0; i < c; i++) {
auto node = heap.Pop(); auto node = heap.Pop();
T2TState* s = (T2TState*)node.index; State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) { if (i == 0 || bestScore < node.value) {
state = s; state = s;
bestScore = node.value; bestScore = node.value;
...@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum) ...@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
check whether all hypotheses are completed check whether all hypotheses are completed
>> beam - the beam that keeps the searching states >> beam - the beam that keeps the searching states
*/ */
bool BeamSearch::IsAllCompleted(T2TStateBundle* beam) bool BeamSearch::IsAllCompleted(StateBundle* beam)
{ {
T2TState* states = beam->states; State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) { for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i]; State& state = states[i];
if (!state.isCompleted) if (!state.isCompleted)
return false; return false;
} }
...@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses ...@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses
>> alivePadding - new paddings for the inputs, (B, L) >> alivePadding - new paddings for the inputs, (B, L)
<< aliveIdx - the indices of alive states << aliveIdx - the indices of alive states
*/ */
void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding, void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState) XTensor& aliveState)
{ {
T2TState* states = beam->states; State* states = beam->states;
/* get the indices of uncompleted sentences and states */ /* get the indices of uncompleted sentences and states */
aliveSentList.Clear(); aliveSentList.Clear();
...@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi ...@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
} }
} }
InitTensor1D(&aliveState, aliveStateList.Size(), X_INT, aliveEncoding.devID); InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, aliveStateList.Size()); aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent; XTensor aliveSent;
InitTensor1D(&aliveSent, aliveSentList.Size(), X_INT, aliveEncoding.devID); InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, aliveSentList.Size()); aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) { if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState); aliveInput = AutoGather(aliveInput, aliveState);
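A minimal CPU sketch (not the repo's AutoGather) of the gathering used here: only the rows named by the alive indices survive, which is how the batch shrinks once some hypotheses finish:

#include <vector>

std::vector<std::vector<float> > GatherRows(
    const std::vector<std::vector<float> >& rows,
    const std::vector<int>& indices)
{
    std::vector<std::vector<float> > kept;
    for (size_t i = 0; i < indices.size(); i++)
        kept.push_back(rows[indices[i]]);   /* keep only the alive rows */
    return kept;
}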
...@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi ...@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
make a mask to prevent duplicated entries in beam expansion for the first position make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states >> beam - the beam that keeps the searching states
*/ */
XTensor BeamSearch::MakeFirstMask(T2TStateBundle* beam) XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{ {
XTensor& prob = beam->prob; XTensor& prob = beam->prob;
XTensor mask; XTensor mask;
...@@ -742,7 +741,7 @@ initialize the model ...@@ -742,7 +741,7 @@ initialize the model
>> config - the configuration of the system >> config - the configuration of the system
*/ */
void GreedySearch::Init(T2TConfig& config) void GreedySearch::Init(Config& config)
{ {
batchSize = config.wBatchSize; batchSize = config.wBatchSize;
endSymbols[0] = config.endID; endSymbols[0] = config.endID;
...@@ -798,7 +797,7 @@ search for the most promising states ...@@ -798,7 +797,7 @@ search for the most promising states
>> padding - padding of the input >> padding - padding of the input
>> output - output that represents the sequences as rows >> output - output that represents the sequences as rows
*/ */
void GreedySearch::Search(T2TModel* model, XTensor& input, void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output) XTensor& padding, IntList* output)
{ {
XTensor maskEnc; XTensor maskEnc;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,15 +19,15 @@ ...@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TSEARCH_H__ #ifndef __SEARCH_H__
#define __T2TSEARCH_H__ #define __SEARCH_H__
#include "../T2TModel.h" #include "../Model.h"
#include "T2TPredictor.h" #include "Predictor.h"
using namespace std; using namespace std;
namespace transformer namespace nmt
{ {
/* The class organizes the search process. It calls "predictors" to generate /* The class organizes the search process. It calls "predictors" to generate
...@@ -42,7 +41,7 @@ private: ...@@ -42,7 +41,7 @@ private:
float alpha; float alpha;
/* predictor */ /* predictor */
T2TPredictor predictor; Predictor predictor;
/* max length of the generated sequence */ /* max length of the generated sequence */
int maxLength; int maxLength;
...@@ -88,28 +87,28 @@ public: ...@@ -88,28 +87,28 @@ public:
~BeamSearch(); ~BeamSearch();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* search for the most promising states */ /* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score); void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */ /* preparation */
void Prepare(int myBatchSize, int myBeamSize); void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */ /* compute the model score for each hypothesis */
void Score(T2TStateBundle* prev, T2TStateBundle* beam); void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */ /* generate token indices via beam pruning */
void Generate(T2TStateBundle* prev, T2TStateBundle* beam); void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */ /* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState); void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */ /* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam); void Collect(StateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */ /* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam); void FillHeap(StateBundle* beam);
/* save the output sequences and score */ /* save the output sequences and score */
void Dump(IntList* output, XTensor* score); void Dump(IntList* output, XTensor* score);
...@@ -118,17 +117,17 @@ public: ...@@ -118,17 +117,17 @@ public:
bool IsEnd(int token); bool IsEnd(int token);
/* check whether all hypotheses are completed */ /* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam); bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */ /* update the beam by pruning finished states */
void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding, void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx); XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */ /* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum); void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */ /* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam); XTensor MakeFirstMask(StateBundle* beam);
}; };
class GreedySearch class GreedySearch
...@@ -136,7 +135,7 @@ class GreedySearch ...@@ -136,7 +135,7 @@ class GreedySearch
private: private:
/* predictor */ /* predictor */
T2TPredictor predictor; Predictor predictor;
/* max length of the generated sequence */ /* max length of the generated sequence */
int maxLength; int maxLength;
...@@ -164,10 +163,10 @@ public: ...@@ -164,10 +163,10 @@ public:
~GreedySearch(); ~GreedySearch();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* search for the most promising states */ /* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output); void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */ /* preparation */
void Prepare(int myBatchSize); void Prepare(int myBatchSize);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -20,27 +19,25 @@ ...@@ -20,27 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#include <cmath> #include "Search.h"
#include "Translator.h"
#include "T2TTranslator.h" #include "../Utility.h"
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h" #include "../../../tensor/XTensor.h"
#include "../../../tensor/XUtility.h" #include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace nmt
{ {
/* constructor */ /* constructor */
T2TTranslator::T2TTranslator() Translator::Translator()
{ {
} }
/* de-constructor */ /* de-constructor */
T2TTranslator::~T2TTranslator() Translator::~Translator()
{ {
if (beamSize > 1) if (beamSize > 1)
delete (BeamSearch*)seacher; delete (BeamSearch*)seacher;
...@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator() ...@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator()
} }
/* initialize the model */ /* initialize the model */
void T2TTranslator::Init(T2TConfig& config) void Translator::Init(Config& config)
{ {
beamSize = config.beamSize; beamSize = config.beamSize;
vSize = config.srcVocabSize; vSize = config.srcVocabSize;
...@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config) ...@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config)
wordBatch = config.wBatchSize; wordBatch = config.wBatchSize;
if (beamSize > 1) { if (beamSize > 1) {
XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize); LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch(); seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config); ((BeamSearch*)seacher)->Init(config);
} }
else if (beamSize == 1) { else if (beamSize == 1) {
XPRINT1(0, stderr, "Translating with greedy search\n", beamSize); LOG("translating with greedy search");
seacher = new GreedySearch(); seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config); ((GreedySearch*)seacher)->Init(config);
} }
else { else {
CheckNTErrors(false, "invalid beam size\n"); CheckNTErrors(false, "Invalid beam size\n");
} }
} }
...@@ -80,8 +77,8 @@ test the model ...@@ -80,8 +77,8 @@ test the model
>> ofn - output data file >> ofn - output data file
>> model - pretrained model >> model - pretrained model
*/ */
void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, void Translator::Translate(const char* ifn, const char* sfn,
const char* ofn, T2TModel* model) const char* tfn, const char* ofn, Model* model)
{ {
int wc = 0; int wc = 0;
int wordCountTotal = 0; int wordCountTotal = 0;
...@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
XTensor paddingEnc; XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn); batchLoader.Init(ifn, sfn, tfn);
XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n", LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
GetClockSec() - startT);
int count = 0; int count = 0;
double batchStart = GetClockSec(); double batchStart = GetClockSec();
...@@ -130,22 +126,22 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -130,22 +126,22 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
for (int i = 0; i < indices.Size() - 1; ++i) { for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result; Result* res = new Result;
res->id = indices[i]; res->id = int(indices[i]);
res->res = output[i]; res->res = output[i];
batchLoader.outputBuffer.Add(res); batchLoader.outputBuffer.Add(res);
} }
delete[] output; delete[] output;
wc += indices[-1]; wc += int(indices[-1]);
wordCountTotal += indices[-1]; wordCountTotal += int(indices[-1]);
sentCount += (indices.Size() - 1); sentCount += int(indices.Size() - 1);
batchCount += 1; batchCount += 1;
if (count % 1 == 0) { if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart; double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec(); batchStart = GetClockSec();
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n", LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()), elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed); double(wc) / elapsed);
wc = 0; wc = 0;
...@@ -169,7 +165,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, ...@@ -169,7 +165,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
double elapsed = GetClockSec() - startDump; double elapsed = GetClockSec() - startDump;
XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%llu)\n", LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size()); wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
} }
...@@ -178,7 +174,7 @@ dump the result into the file ...@@ -178,7 +174,7 @@ dump the result into the file
>> file - data file >> file - data file
>> output - output tensor >> output - output tensor
*/ */
void T2TTranslator::Dump(FILE* file, XTensor* output) void Translator::Dump(FILE* file, XTensor* output)
{ {
if (output != NULL && output->unitNum != 0) { if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1]; int seqLength = output->dimSize[output->order - 1];
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,17 +20,17 @@ ...@@ -21,17 +20,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#ifndef __T2TTESTER_H__ #ifndef __TESTER_H__
#define __T2TTESTER_H__ #define __TESTER_H__
#include "T2TSearch.h" #include "Search.h"
#include "T2TDataSet.h" #include "DataSet.h"
namespace transformer namespace nmt
{ {
/* This class translates test sentences with a trained model. */ /* This class translates test sentences with a trained model. */
class T2TTranslator class Translator
{ {
public: public:
/* vocabulary size of the source side */ /* vocabulary size of the source side */
...@@ -57,17 +56,17 @@ public: ...@@ -57,17 +56,17 @@ public:
public: public:
/* constructor */ /* constructor */
T2TTranslator(); Translator();
/* de-constructor */ /* de-constructor */
~T2TTranslator(); ~Translator();
/* initialize the model */ /* initialize the model */
void Init(T2TConfig& config); void Init(Config& config);
/* test the model */ /* test the model */
void Translate(const char* ifn, const char* vfn, const char* ofn, void Translate(const char* ifn, const char* vfn, const char* ofn,
const char* tfn, T2TModel* model); const char* tfn, Model* model);
/* dump the result into the file */ /* dump the result into the file */
void Dump(FILE* file, XTensor* output); void Dump(FILE* file, XTensor* output);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -21,8 +20,8 @@ ...@@ -21,8 +20,8 @@
#include <fstream> #include <fstream>
#include "T2TVocab.h" #include "Vocab.h"
#include "../module/T2TUtility.h" #include "../Utility.h"
namespace nts { namespace nts {
...@@ -31,7 +30,7 @@ void Vocab::Load(const string& src) ...@@ -31,7 +30,7 @@ void Vocab::Load(const string& src)
{ {
string vsz, sid; string vsz, sid;
ifstream f(src, ios::in); ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "Unable to open the vocabulary file"); CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */ /* get the vocab size and the start id */
f >> vsz >> sid; f >> vsz >> sid;
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020 NiuTrans Research. All rights reserved.
* All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
...@@ -19,8 +18,8 @@ ...@@ -19,8 +18,8 @@
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/ */
#ifndef __T2TVOCAB_H__ #ifndef __VOCAB_H__
#define __T2TVOCAB_H__ #define __VOCAB_H__
#include <cstdio> #include <cstdio>
#include <unordered_map> #include <unordered_map>
...@@ -30,10 +29,10 @@ using namespace std; ...@@ -30,10 +29,10 @@ using namespace std;
namespace nts { namespace nts {
/* user-defined symbols */ /* user-defined symbols */
#define UNK 0
#define PAD 1 #define PAD 1
#define SOS 2 #define SOS 2
#define EOS 2 #define EOS 2
#define UNK 3
/* the vocabulary class */ /* the vocabulary class */
struct Vocab struct Vocab
......
...@@ -180,8 +180,6 @@ extern FILE * tF; ...@@ -180,8 +180,6 @@ extern FILE * tF;
extern int tmpCountV2; extern int tmpCountV2;
extern int nnnTotal; extern int nnnTotal;
void PrintTrace(void);
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif
...@@ -1511,9 +1511,12 @@ void XMem::ShowMemUsage(FILE * file) ...@@ -1511,9 +1511,12 @@ void XMem::ShowMemUsage(FILE * file)
} }
MTYPE bufTotal = bufSize; MTYPE bufTotal = bufSize;
MTYPE bufUsed = this->bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n", fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal); (DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
subtraction of data arrays (CUDA Kernel)
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta;
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta.
>> beta - the scaling factor
*/
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in subtraction!");
CheckNTErrors((a->devID == b->devID && a->devID == c->devID),
"The tensors must be on the same device!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in subtraction!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
/* subtraction over arrays
tensor subtraction c = a - b * \beta (cuda version) with an input handle
>> devID - device ID (MUST >= 0)
>> handle - cuda handle
>> a - an array
>> b - another array
>> c - where we put a-b
>> size - size of the array
>> beta - the coefficient
*/
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
{
if (size == 0)
return;
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty arrays in subtraction!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if (c == a) {
    /* in-place case: axpy computes a = alpha * b + a, so the
       coefficient must be -beta to obtain a = a - b * beta */
    DTYPE alpha = -beta;
#ifdef DOUBELPRICSION
    cublasDaxpy(*handle, size, &alpha, b, 1, a, 1);
#else
    cublasSaxpy(*handle, size, &alpha, b, 1, a, 1);
#endif
}
else {
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(devID, size, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUB<<<blocks, threads>>>((DTYPE*)a, (DTYPE*)b, (DTYPE*)c, size, beta);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-01
*/
#ifndef __SUB_CUH__
#define __SUB_CUH__
#include "Sub.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* subtraction of data arrays (CUDA Kernel) */
__global__
void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) */
void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta (cuda version) with an input handle */
void _CudaSubWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
_CudaSubDim(a, b, c, n, beta);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
else {
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
if (a->dataType == DEFAULT_DTYPE) {
int num = a->unitNum;
if (stride > 1) {
for (int i = 0, j = 0; i < num; i += stride, j++) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
DTYPE * cp = (DTYPE*)c->data + i;
for (int k = 0; k < stride; k++)
cp[k] = ap[k] - bv;
}
}
else if (stride == 1) {
DTYPE * bp = (DTYPE*)b->data;
for (int i = 0; i < num; i += blockSize) {
DTYPE * ap = (DTYPE*)a->data + i;
DTYPE * cp = (DTYPE*)c->data + i;
if (beta == 1.0F) {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j];
}
else {
for (int j = 0; j < blockSize; j++)
cp[j] = ap[j] - bp[j] * beta;
}
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
}
}
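A worked example of the stride decomposition above, assuming a has shape (2, 3, 4) and n = 1 (the shape is illustrative):

#include <cstdio>

int main()
{
    int dimSize[3] = {2, 3, 4};
    int n = 1, stride = 1, blockNum = 1;
    for (int i = 2; i >= 0; i--) {
        if (i > n)
            stride *= dimSize[i];      /* dimensions after n */
        else if (i < n)
            blockNum *= dimSize[i];    /* dimensions before n */
    }
    /* prints stride=4 blockSize=3 blockNum=2: element (i, j, k) sits at
       offset i * 12 + j * 4 + k and has b[j] * beta subtracted from it */
    printf("stride=%d blockSize=%d blockNum=%d\n", stride, dimSize[n], blockNum);
    return 0;
}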
/*
tensor subtraction (do it on site)
keep the result in the input tensor and return nothing
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
*/
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
{
_SubDim(a, b, a, n, beta);
}
/*
tensor subtraction (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> beta - the scaling factor
<< return - the result tensor by tensor subtraction
*/
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (a.enableGrad && b.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
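A usage sketch of the broadcasting subtraction (illustrative; InitTensor2D, InitTensor1D and SetDataRand are the helpers used elsewhere in the library and are assumed available here):

#include "../../XTensor.h"
#include "SubDim.h"
using namespace nts;

/* subtract a length-3 vector from every row of a 2 x 3 tensor by
   broadcasting over dimension 1 */
void SubDimExample()
{
    XTensor a, b;
    InitTensor2D(&a, 2, 3, X_FLOAT);
    InitTensor1D(&b, 3, X_FLOAT);
    a.SetDataRand(0.0F, 1.0F);
    b.SetDataRand(0.0F, 1.0F);
    XTensor c = SubDim(a, b, 1);   /* c[i][j] = a[i][j] - b[j] */
}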
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "SubDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
tensor subtraction of a tensor and a row vector
c = a - b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
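/* after the barrier, every thread in the block reuses b's entry from
   fast shared memory instead of issuing its own global-memory load */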
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
/*
tensor subtraction (cuda version)
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched data types in subtraction!");
CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
int stride = 1;
int blockSize = a->dimSize[n];
int blockNum = 1;
for (int i = a->order - 1; i >= 0; i--) {
if (i > n)
stride *= a->dimSize[i];
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
else
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_CUH__
#define __SUBDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting (cuda version) */
void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __SUBDIM_H__
#define __SUBDIM_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. we keep the result in the input tensor a and return nothing */
void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
...@@ -136,7 +136,6 @@ i.e., a is summed with b by broadcasting ...@@ -136,7 +136,6 @@ i.e., a is summed with b by broadcasting
>> a - a tensor >> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a >> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index >> n - the dimension index
>> inplace - indicates whether the result will be placed in the input tensor
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta) void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
......
...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a Glorot initialization*/ /* generate data items with a Glorot initialization*/
void _SetDataXavierNormal(XTensor * tensor, DTYPE gain = 1.0F); void _SetDataXavierNormal(XTensor * tensor, DTYPE gain = 1.0F);
/* generate data items with a xavier initialization */ /* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F); void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#include "../../XGlobal.h"
#include "Float16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
float16 float16::SetOverFlow()
{
exp = 31;
data = 0;
return *this;
}
int float16::IsOverlFlow() const
{
return exp==31;
}
// masks used to locate the highest set bit
unsigned int float16::mask[32] =
{
0xffffffff,0xfffffffe,0xfffffffc,0xfffffff8,0xfffffff0,0xffffffe0,0xffffffc0,0xffffff80,
0xffffff00,0xfffffe00,0xfffffc00,0xfffff800,0xfffff000,0xffffe000,0xffffc000,0xffff8000,
0xffff0000,0xfffe0000,0xfffc0000,0xfff80000,0xfff00000,0xffe00000,0xffc00000,0xff800000,
0xff000000,0xfe000000,0xfc000000,0xf8000000,0xf0000000,0xe0000000,0xc0000000,0x80000000
};
// lookup table of powers of 2
unsigned int float16::pow2[32] =
{
0x00000001,0x00000002,0x00000004,0x00000008,0x00000010,0x00000020,0x00000040,0x00000080,
0x00000100,0x00000200,0x00000400,0x00000800,0x00001000,0x00002000,0x00004000,0x00008000,
0x00010000,0x00020000,0x00040000,0x00080000,0x00100000,0x00200000,0x00400000,0x00800000,
0x01000000,0x02000000,0x04000000,0x08000000,0x10000000,0x20000000,0x40000000,0x80000000,
};
// compare absolute values: return 1 if |a| < |b|, else 0
int float16::AbsCompare(const float16 & a, const float16 & b)
{
if (a.exp < b.exp)
return 1;
else if (a.exp > b.exp)
return 0;
return a.data < b.data;
}
// get the inverse: a value such that a * inverse(a) is approximately 1
float16 float16::GetInverse() const
{
float16 ans;
ans.sign = sign;
ans.exp = 29 - exp;
int rec = pow2[31];
// divide 0x80000000 by the mantissa (with the implicit leading 1 restored)
rec /= (this->data | pow2[10]);
if (!(rec & pow2[21])) {
rec <<= 1;
ans.exp++;
}
rec >>= 10;
ans.data = rec;
return ans;
}
/* constructor from (sign, exp, data), similar to IEEE 754 32-bit floating point
>> s - sign: 1 bit
>> e - exp: 5 bits
>> d - data: 10 bits
*/
float16::float16(const int& s, const int& e, const int& d)
{
sign = s;
exp = e;
data = d;
}
/* initialize the 16-bit floating point number to 0
*/
float16::float16()
{
sign = 0;
exp = 0;
data = 0;
}
/* constructor from another data type:
we convert the input to float, then convert the float to float16.
>> data - the number to convert
*/
template<class T>
float16::float16(const T& data)
{
*this = (float)data;
}
template float16::float16 (const int &);
template float16::float16 (const double &);
/* constructor from a 32-bit float
>> data - a 32-bit float number
*/
float16::float16(const float& data)
{
*this = data;
}
void float16::Dump()
{
printf("sign: %d\texp: %d\tdata: %d\n", sign, exp, data);
}
/*
convert float16 to float and return
the 32-bit layout is:
bit 31 holds the sign,
bits 30~23 hold the exponent (biased by 127),
bits 22~0 hold the mantissa
*/
float float16::Float()
{
int ret = 0;
ret = IsOverlFlow() ? 0x7f800000 :
(sign ? 0x80000000 : 0) | ((exp + 112) << 23) | (data << 13);
float p = *(float*)&ret;
return p;
}
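A quick standalone check (illustrative, not part of the library) of the re-biasing above: the half-precision exponent bias is 15 and the float bias is 127, so the stored exponent grows by 127 - 15 = 112 on conversion:

#include <cstdio>

int main()
{
    /* 1.0 in this half format: sign = 0, exp = 15, data = 0 */
    unsigned int bits = (0u << 31) | ((15u + 112u) << 23) | (0u << 13);
    float f = *(float*)&bits;   /* same type-punning as Float() above */
    printf("%f\n", f);          /* prints 1.000000: the bits are 0x3F800000 */
    return 0;
}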
// basic assignment function
float16 float16::operator = (const float16& a)
{
sign = a.sign;
exp = a.exp;
data = a.data;
return *this;
}
// convert float to float16
float16 float16::operator = (const float& a)
{
unsigned int p = *(unsigned int*)&a;
sign = p & pow2[31] ? 1 : 0;
if (a > 65535 || a < -65535)
return SetOverFlow();
exp = ((p >> 23)& (0xf)) | ((p >> 26 & 0x10));
data = (p >> 13);
return *this;
}
/* The template assignment function casts the other data type to float,
then calls the float assignment function.
It currently supports int and double.
*/
template <class T>
float16 float16::operator = (const T& data)
{
*this = (float)data;
return *this;
}
template float16 float16:: operator = <int>(const int&);
template float16 float16:: operator = <double>(const double&);
/*
template for multi-datatype overload
>> operation - the operator to overload, e.g., <, +
>> returnType - the return type of the function, e.g., int, float16
>> expression - the returned expression
*/
#define _OVERLOAD_OPRATER_TEMPLATE(operation, returnType, expression) \
template<class T> \
returnType float16::operator operation (const T & data) \
{ \
float16 rec=(float)data; \
return expression; \
} \
template returnType float16::operator operation <int>(const int&); \
template returnType float16::operator operation <float>(const float&); \
template returnType float16::operator operation <double>(const double&);
// overload operator < (less than) a < b
int float16::operator < (const float16& data)
{
if (sign < data.sign)
return 1;
else if (sign > data.sign)
return 0;
if (exp < data.exp)
return 1;
else if (exp > data.exp)
return 0;
return this->data < data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(< , int, *this < rec)
// overload operator <= (less than or equal) a <= b
int float16::operator <= (const float16& data)
{
if (sign < data.sign)
return 1;
else if (sign > data.sign)
return 0;
if (exp < data.exp)
return 1;
else if (exp > data.exp)
return 0;
return this->data <= data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(<= , int, *this <= rec)
// overload operator > (greater than) a > b
int float16::operator > (const float16& data)
{
if (sign > data.sign)
return 1;
else if (sign < data.sign)
return 0;
if (exp > data.exp)
return 1;
else if (exp < data.exp)
return 0;
return this->data > data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(> , int, * this > rec)
// overload operator >= (greater than or equal) a >= b
int float16::operator >= (const float16& data)
{
if (sign > data.sign)
return 1;
else if (sign < data.sign)
return 0;
if (exp > data.exp)
return 1;
else if (exp < data.exp)
return 0;
return this->data >= data.data;
}
_OVERLOAD_OPRATER_TEMPLATE(>= , int, *this >= rec)
// overload operator + (add) a + b
float16 float16::operator + (const float16& data)
{
float16 ans;
// avoid overflow inf + anything = inf
if (this->IsOverlFlow())
return *this;
if (data.IsOverlFlow())
return data;
/* the operand with the greater magnitude determines the sign, and
the smaller one is shifted right to align with it */
if (AbsCompare(*this, data)) {
ans.sign = data.sign;
// record the exponent
int recp = data.exp;
// compute the mantissa sum (implicit leading bits restored)
int recd = (data.data | (pow2[10])) +
((data.sign ^ sign) ? -1 : 1) *
(((pow2[10]) | this->data) >> (data.exp - exp));
// the sum may carry; if so, shift the mantissa and adjust the exponent
if (recd) {
// shift right while any bit above bit 10 is set
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
// shift left until the leading 1 reaches bit 10
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
// if data==0, exp should be 0
else
recp = 0;
ans.data = recd;
// saturate if the exponent overflows
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = recd;
}
}
// same as above; the branch is duplicated to avoid extra assignments
else {
ans.sign = sign;
int recp = exp;
int recd = (this->data | (pow2[10])) +
((sign ^ data.sign) ? -1 : 1) *
(((pow2[10]) | data.data) >> (exp - data.exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else
recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = recd;
}
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(+, float16, *this = *this + rec)
// overload operator += (add) a += b
float16 float16::operator+=(const float16& data) {
return *this = *this + data;
}
_OVERLOAD_OPRATER_TEMPLATE(+=, float16, *this = *this + rec)
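A worked example of the alignment step, using 2.5 + 0.5 (an illustration with hand-encoded values, not code from this commit): 2.5 encodes as exp = 16, data = 0x100 and 0.5 as exp = 14, data = 0x000.

// Mirror the add path for 2.5 + 0.5 with plain unsigned arithmetic.
#include <cstdio>

int main()
{
    unsigned bigM   = 0x400u | 0x100u;        // implicit bit | data of 2.5
    unsigned smallM = (0x400u | 0x000u) >> 2; // 0.5 shifted by the exp gap (16 - 14)
    unsigned sum = bigM + smallM;             // 0x600: leading 1 already at bit 10
    // result: exp = 16, data = sum & 0x3FF = 0x200 -> 1.5 * 2^(16 - 15) = 3.0
    printf("mantissa sum = 0x%X -> %g\n", sum, (double)sum / 0x400 * 2);
    return 0;
}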
// overload operator - (negation) -a
float16 float16::operator - ()
{
sign ^= 1;
float16 rec = *this;
sign ^= 1;
return rec;
}
// overload operator - (subtraction) a - b
float16 float16::operator - (const float16& data)
{
float16 ans;
if (this->IsOverlFlow())
return *this;
if (data.IsOverlFlow())
return data;
/* same as addition, except for the sign handling:
subtracting a larger number from a positive one yields a negative result */
if (AbsCompare(*this, data)) {
ans.sign = !data.sign;
int recp = data.exp;
int recd = (data.data | (pow2[10])) +
((data.sign ^ sign) ? 1 : -1) *
(((pow2[10]) | this->data) >> (data.exp - exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.data = recd;
ans.exp = recp;
}
}
else {
ans.sign = sign;
int recp = exp;
int recd = (this->data | (pow2[10])) +
((sign ^ data.sign) ? 1 : -1) *
(((pow2[10]) | data.data) >> (exp - data.exp));
if (recd) {
while (mask[10] & recd) {
recd >>= 1;
recp++;
}
while (!(mask[10] & recd)) {
recd <<= 1;
recp--;
}
}
else recp = 0;
if (recp >= 31)
ans.SetOverFlow();
else {
ans.data = recd;
ans.exp = recp;
}
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(-, float16, *this = *this - rec)
// overload operator -= (subtraction) a -= b
float16 float16::operator-=(const float16& data)
{
return *this = *this - data;
}
_OVERLOAD_OPRATER_TEMPLATE(-=, float16, *this = *this - rec)
// overload operator * (multiplication) a * b
float16 float16::operator * (const float16& data)
{
//if(IsOverlFlow())
// return *this;
//if(data.IsOverlFlow())
// return data;
float16 ans;
// XOR of the signs: different signs give 1 (negative), same signs give 0 (positive)
ans.sign = sign ^ data.sign;
// multiply the mantissas (implicit leading bits restored)
int rec = (data.data | pow2[10]) * (this->data | pow2[10]);
// compute the new exponent: add the biased exponents, remove one bias of 15, clamp at 0
int recp = exp + data.exp - 15 > 0 ? exp + data.exp - 15 : 0;
// if the product carried, renormalize the mantissa and exponent
rec >>= 10;
while (rec & mask[11]) {
++recp;
rec >>= 1;
}
if (recp >= 31)
ans.SetOverFlow();
else {
ans.exp = recp;
ans.data = rec;
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(*, float16, (*this)* rec)
// overload operator *= (multiplication) a *= b
float16 float16::operator *= (const float16& data)
{
return *this = *this * data;
}
_OVERLOAD_OPRATER_TEMPLATE(*=, float16, *this = *this * rec)
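The multiply path can also be traced by hand, e.g. for 1.5 * 1.5. This is an illustration only; the ~0x7FF test below assumes mask[11] covers all bits above bit 10, matching how the normalization loops here behave:

// Mirror the multiply path for 1.5 * 1.5: both operands encode as
// exp = 15, data = 0x200 (value = 1.5 * 2^0).
#include <cstdio>

int main()
{
    unsigned m = (0x400u | 0x200u) * (0x400u | 0x200u); // 1536 * 1536
    int exp = 15 + 15 - 15; // add biased exponents, remove one bias
    m >>= 10;               // drop extra fraction bits: 0x900
    while (m & ~0x7FFu) {   // renormalize until the leading 1 is at bit 10
        m >>= 1;
        ++exp;
    }
    // exp = 16, data = m & 0x3FF = 0x080 -> 1.125 * 2^(16 - 15) = 2.25
    printf("exp = %d, data = 0x%03X\n", exp, m & 0x3FFu);
    return 0;
}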
// overload operator / (division) a / b
float16 float16::operator / (const float16& data)
{
float16 ans;
// XOR of the signs: different signs give 1 (negative), same signs give 0 (positive)
ans.sign = sign ^ data.sign;
// compute the new exponent
int recp = exp - data.exp + 14;
// shift the dividend left before dividing to avoid precision loss;
// unsigned arithmetic keeps bit 31 from acting as a sign bit
unsigned int recd = (this->data << 21) | pow2[31];
recd /= (data.data | pow2[10]);
// renormalize: if the quotient reached bit 21, bring the leading 1 down to bit 20
if (recd & pow2[21]) {
recd >>= 1;
++recp;
}
if (recp >= 31)
ans.SetOverFlow();
else {
recd >>= 10;
ans.data = recd;
ans.exp = recp;
}
return ans;
}
_OVERLOAD_OPRATER_TEMPLATE(/ , float16, (*this) / rec)
// overload operator /= (division) a /= b
float16 float16::operator /= (const float16& data) {
return *this = *this / data;
}
_OVERLOAD_OPRATER_TEMPLATE(/=, float16, *this = *this / rec)
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Guan Huhao 2020-02-05
* $Updated by: Xu Chen (email: hello_master1954@163.com) 2020-05-01
*/
#ifndef FLOAT16_H
#define FLOAT16_H
namespace nts { // namespace nts(NiuTrans.Tensor)
struct float16
{
private:
/*
sign is the sign bit: 1 means negative, 0 means positive
exp is the exponent, stored with an offset (bias) of 15
data is the fraction; as in IEEE 754, the leading 1 is implicit and not stored
*/
unsigned short data : 10;
unsigned short exp : 5;
unsigned short sign : 1;
// masks used to locate the highest set bit
static unsigned int mask[32];
static unsigned int pow2[32];
//int FindHighOne(const int &num, int &l, int &r);
int AbsCompare(const float16 & a,const float16 & b);
public:
float16 SetOverFlow();
// check whether the value has overflowed
int IsOverlFlow() const;
/* constructor by (sign, exp, data)
similar to ieee 32 floating point
sign: 1bit
exp: 5bit
data: 10bit */
float16(const int& s, const int& e, const int& d);
/* default constructor
This initializes the 16bit floating point to 0. */
float16();
// constructor by a 32-bit float num
float16(const float& data);
// constructor by other datatype
template<class T> float16(const T& data);
void Dump();
// convert float16 to float and return
float Float();
/* assignment operators and template overloads
The float assignment operator is the basic one.
The template version casts the other datatype to float,
then calls the float assignment operator.
It is instantiated for int and double. */
float16 operator = (const float& data);
float16 operator = (const float16& data);
template<class T> float16 operator = (const T& data);
// overload operator < (less than) a < b
int operator < (const float16& data);
template<class T> int operator < (const T& data);
// overload operator <= (less than or equal) a <= b
int operator <= (const float16& data);
template<class T> int operator <= (const T& data);
// overload operator > (greater than) a > b
int operator > (const float16& data);
template<class T> int operator > (const T& data);
// overload operator >= (greater than or equal) a >= b
int operator >= (const float16& data);
template<class T> int operator >= (const T& data);
// overload operator + (add) a + b
float16 operator + (const float16& data);
template<class T> float16 operator + (const T& data);
// overload operator += (add) a += b
float16 operator += (const float16& data);
template<class T> float16 operator += (const T& data);
// overload operator - (negation) -a
float16 operator - ();
// overload operator - (subtraction) a - b
float16 operator - (const float16& data);
template<class T> float16 operator - (const T& data);
// overload operator -= (subtraction) a -= b
float16 operator -= (const float16& data);
template<class T> float16 operator -= (const T& data);
// overload operator * (multiplication) a * b
float16 operator * (const float16& data);
template<class T> float16 operator * (const T& data);
// overload operator *= (multiplication) a *= b
float16 operator *= (const float16& data);
template<class T> float16 operator *= (const T& data);
// get the multiplicative inverse: a * a.GetInverse() == 1
float16 GetInverse() const;
// overload operator / (division) a / b
float16 operator / (const float16& data);
template<class T> float16 operator / (const T& data);
// overload operator /= (division) a /= b
float16 operator /= (const float16& data);
template<class T> float16 operator /= (const T& data);
};
} // namespace nts(NiuTrans.Tensor)
#endif /* FLOAT16_H */
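A minimal usage sketch for the struct above (hypothetical; assumes float16.h/.cpp from this commit are built into the target):

#include <cstdio>
#include "float16.h"

int main()
{
    nts::float16 a = 2.5f;
    nts::float16 b = 0.5f;
    nts::float16 sum = a + b;  // 3.0
    nts::float16 prod = a * 2; // the int operand is routed through float: 5.0
    printf("sum = %f, prod = %f\n", sum.Float(), prod.Float());
    printf("a > b: %d\n", a > b); // comparisons return int; prints 1
    return 0;
}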
@@ -87,7 +87,7 @@ bool TestMultiply1()
 /* call Multiply function */
 _Multiply(s1, s2, t, 0, 0);
 _MultiplyMe(tMe, s2, 0, 0);
-tUser = Multiply(*s1, *s2, 0);
+tUser = Multiply(*s1, *s2, false, 0);
 /* check results */
 cpuTest = _CheckData(t, answer, tUnitNum, 1e-4F) &&
...
@@ -161,7 +161,7 @@ bool TestSub2()
 /* call Sub function */
 _Sub(a, b, c, beta);
 _SubMe(cMe, b, beta);
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
...
@@ -268,7 +268,7 @@ bool TestSub3()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -370,7 +370,7 @@ bool TestSub4()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -472,7 +472,7 @@ bool TestSub5()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sub(*a, *b, beta);
+cUser = Sub(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include "../core/utilities/CheckData.h"
#include "../core/arithmetic/SubDim.h"
#include "../XTensor.h"
#include "TSubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim1()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2) */
int bOrder = 1;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2] = {1.0F, -1.0F};
DTYPE answer[2][4] = { {-1.0F, 0.0F, 1.0F, 2.0F},
{5.0F, 6.0F, 7.0F, 8.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 0);
_SubDim(cMe, b, 0);
cUser = SubDim(*a, *b, 0);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 0);
_SubDim(cMeGPU, bGPU, 0);
cUserGPU = SubDim(*aGPU, *bGPU, 0);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: tensor subtraction c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
*/
bool TestSubDim2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 2) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 2;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][2] = { {1.0F, -1.0F},
{-1.0F, 1.0F} };
DTYPE answer[2][4] = { {-1.0F, 2.0F, 3.0F, 2.0F},
{3.0F, 6.0F, 7.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensorV2(aOrder, aDimSize);
XTensor * b = NewTensorV2(bOrder, bDimSize);
XTensor * c = NewTensorV2(aOrder, aDimSize);
XTensor * cMe = NewTensorV2(aOrder, aDimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
cMe->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
c->SetZeroAll();
/* call SubDim function */
_SubDim(a, b, c, 1);
_SubDim(cMe, b, 1);
cUser = SubDim(*a, *b, 1);
/* check results */
cpuTest = _CheckData(c, answer, aUnitNum) &&
_CheckData(cMe, answer, aUnitNum) &&
_CheckData(&cUser, answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensorV2(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensorV2(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
cMeGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
cGPU->SetZeroAll();
/* call SubDim function */
_SubDim(aGPU, bGPU, cGPU, 1);
_SubDim(cMeGPU, bGPU, 1);
cUserGPU = SubDim(*aGPU, *bGPU, 1);
/* check results */
gpuTest = _CheckData(cGPU, answer, aUnitNum) &&
_CheckData(cMeGPU, answer, aUnitNum) &&
_CheckData(&cUserGPU, answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for SubDim Function */
bool TestSubDim()
{
XPRINT(0, stdout, "[TEST SUBDIM] tensor subtraction c = a - b * beta by broadcasting\n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSubDim1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSubDim2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#ifndef __TEST_SUBDIM_H__
#define __TEST_SUBDIM_H__
#include "../core/arithmetic/SubDim.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for SubDim Function */
bool TestSubDim();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SUBDIM_H__
@@ -161,7 +161,7 @@ bool TestSum2()
 /* call Sum function */
 _Sum(a, b, c, beta);
 _SumMe(cMe, b, beta);
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(c, answer, unitNum, 1e-4F) &&
...
@@ -268,7 +268,7 @@ bool TestSum3()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -370,7 +370,7 @@ bool TestSum4()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
@@ -472,7 +472,7 @@ bool TestSum5()
 b->SetData(bData, bUnitNum);
 /* call Sum function */
-cUser = Sum(*a, *b, beta);
+cUser = Sum(*a, *b, false, beta);
 /* check results */
 cpuTest = _CheckData(&cUser, answer, cUnitNum, 1e-4F);
...
import argparse
from struct import pack
import torch
parser = argparse.ArgumentParser(description='Pack Pytorch model to NiuTensor')
parser.add_argument('-src', help='pytorch model', type=str, default='model.pt')
parser.add_argument('-tgt', help='niutensor model', type=str, default='model.bin')
args = parser.parse_args()
model = torch.load(args.src, map_location='cpu')
model = model['model']
def get_model_parameters(m):
'''
get flattened transformer model parameters
'''
p = []
w = None
for k in m:
if 'embed_tokens.weight' in k:
w = m[k]
elif m[k].numel() != 1:
# p.append(m[k])
if 'weight' in k:
# weights for qkv
if 'in_proj' in k:
dim = m[k].shape[0] // 3
p.append((m[k][:dim, :]).t())
p.append((m[k][dim:dim*2, :]).t())
p.append((m[k][dim*2:, :]).t())
else:
if 'norm' in k:
p.append(m[k])
else:
p.append(m[k].t())
else:
p.append(m[k])
# encoder embedding weight
p.append(w)
# decoder embedding weight
p.append(w)
# output weight
p.append(w)
return p
with torch.no_grad():
params = get_model_parameters(model)
params_number = pack("Q", len(params))
params_size = pack("Q" * len(params), *[p.numel() for p in params])
print('total params: ', len(params))
print('total params size: ', sum([p.numel() for p in params]))
with open(args.tgt+".name.txt", "w") as name_list:
for p in model:
name_list.write("{}\t{}\n".format(p, model[p].shape))
with open(args.tgt+".bin", 'wb') as tgt:
# part 1: number of parameters
# tgt.write(params_number)
# part 2: offsets of parameters
# tgt.write(params_size)
# part 3: values of parameters
for p in params:
values = pack("f" * p.numel(), *(p.contiguous().view(-1).cpu().tolist()))
tgt.write(values)
\ No newline at end of file
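Since the header sections are commented out, the output file is just a raw stream of native-endian float32 values in parameter order (note the script writes to args.tgt + ".bin", so the default output name is model.bin.bin). A hypothetical C++ reader for a quick sanity check:

// Hypothetical reader for the raw parameter stream written by the script
// above; assumes the file was produced on a little-endian machine, since
// pack("f", ...) uses native byte order.
#include <cstdio>
#include <vector>

int main()
{
    FILE* f = fopen("model.bin.bin", "rb");
    if (!f) return 1;
    std::vector<float> values;
    float v;
    while (fread(&v, sizeof v, 1, f) == 1)
        values.push_back(v);
    fclose(f);
    printf("read %zu float32 values\n", values.size());
    return 0;
}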