Commit 05715480 by huchi

Initial commit

/bin
/build
/out
sample/train/iwlst14de-en.train.log
/models
/source/niutensor/
# CMake minimum version
cmake_minimum_required(VERSION 2.8)
# Project's name
project(NiuTrans.NMT)
# The prefix of the generated executable file
set(NIUTRANS_EXE "NiuTrans.NMT")
set(NIUTRANS_DLL "${NIUTRANS_EXE}")
# Generated file path
set(EXECUTABLE_OUTPUT_PATH ../bin)
set(LIBRARY_OUTPUT_PATH ../lib)
# Use CMAKE_MACOSX_RPATH for MacOS
set(CMAKE_MACOSX_RPATH 1)
# Enable folder grouping in IDE project views
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
option(USE_CUDA "Use CUDA" OFF)
option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(USE_FP16 "Use FP16" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
if (USE_CUDA)
if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
if(WIN32)
message(STATUS "HERE cuda")
set(CUDA_TOOLKIT_ROOT_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
else()
set(CUDA_TOOLKIT_ROOT_DIR "/usr/cuda-9.0")
endif()
endif()
message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
if(WIN32)
message(STATUS "HERE mkl")
set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
else()
set(INTEL_ROOT "/usr/intel/compilers_and_libraries_2020.2.254/linux")
endif()
endif()
message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
endif()
if(USE_OPENBLAS)
if(NOT DEFINED OPENBLAS_ROOT)
if(WIN32)
set(OPENBLAS_ROOT "D:/software/BaiduNetdiskDownload/thirdparty20170624/OpenBLAS")
else()
set(OPENBLAS_ROOT "/usr/OpenBLAS")
endif()
endif()
message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
endif()
# Find all the .cpp .h .cu .chu files in source folder
file(GLOB_RECURSE CPP_FILES source/*.cpp)
file(GLOB_RECURSE H_FILES source/*.h)
file(GLOB_RECURSE CU_FILES source/*.cu)
file(GLOB_RECURSE CUH_FILES source/*.cuh)
function(assign_source_group)
foreach(_source IN ITEMS ${ARGN})
if (IS_ABSOLUTE "${_source}")
file(RELATIVE_PATH _source_rel "${CMAKE_CURRENT_SOURCE_DIR}" "${_source}")
else()
set(_source_rel "${_source}")
endif()
get_filename_component(_source_path "${_source_rel}" PATH)
string(REPLACE "/" "\\" _source_path_msvc "${_source_path}")
source_group("${_source_path_msvc}" FILES "${_source}")
endforeach()
endfunction(assign_source_group)
function(my_add_executable)
foreach(_source IN ITEMS ${ARGN})
assign_source_group(${_source})
endforeach()
if(USE_CUDA)
cuda_add_executable(${ARGV})
else()
add_executable(${ARGV})
endif()
endfunction(my_add_executable)
# Set libs and compiler options for CUDA
if(USE_CUDA)
add_definitions(-DUSE_CUDA)
if(USE_FP16)
add_definitions(-DHALF_PRECISION)
endif()
find_package(CUDA ${CUDA_VERSION} REQUIRED)
if(WIN32)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 --disable-warnings -use_fast_math -DUSE_CUDA")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}curand.lib")
else()
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
if(USE_FP16)
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -DHALF_PRECISION -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
endif()
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
endif()
endif()
# Set libs and compiler options for MKL
if(USE_MKL)
add_definitions(-DMKL)
set(COMPILER_DIR "${INTEL_ROOT}/compiler")
set(MKL_DIR "${INTEL_ROOT}/mkl")
set(CPU_ARCH intel64)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -DMKL")
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
set(COMPILER_LIB_DIR "${COMPILER_DIR}/lib/intel64/")
set(MKL_LIB_DIR "${MKL_DIR}/lib/intel64/")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_intel_lp64.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_core.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}mkl_intel_thread.lib")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5md.lib")
else()
if(USE_CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -DMKL")
else()
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC -DMKL")
endif(USE_CUDA)
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
set(COMPILER_LIB_DIR "${COMPILER_DIR}/lib/intel64/")
set(MKL_LIB_DIR "${MKL_DIR}/lib/intel64/")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_intel_lp64.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_core.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${MKL_LIB_DIR}libmkl_intel_thread.a")
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5.a")
endif()
endif()
# Set libs and compiler options for OpenBLAS
if(USE_OPENBLAS)
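# Note: -DMKL is also defined below; this appears to be required so that
# the MKL-guarded BLAS code path is also compiled when using OpenBLAS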
add_definitions(-DUSE_BLAS -DMKL)
set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_BLAS")
if(WIN32)
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.lib")
else()
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
set(OPENBLAS_LIB_PATH ${OPENBLAS_LIB_PATH} "${OPENBLAS_LIB_DIR}/libopenblas.a")
endif()
endif()
# Integrate all libs
set(CUDA_LIB ${CUDA_LIB_PATH})
set(MKL_LIB ${MKL_LIB_PATH})
set(OPENBLAS_LIB ${OPENBLAS_LIB_PATH})
# Add executable files to project
# Generate dynamic link library about project
if(USE_CUDA)
if(GEN_DLL)
cuda_add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
endif()
else()
if(GEN_DLL)
add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES})
endif()
endif()
# Link external libs to executable files
# Link external libs to dynamic link library
if(WIN32)
add_definitions(-DWIN32)
set(MESS ${MESS} "On Windows")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
endif()
if(USE_MKL)
set(MESS ${MESS} " Use MKL")
set(ALL_LIB ${ALL_LIB} ${MKL_LIB})
elseif(USE_OPENBLAS)
set(MESS ${MESS} " Use OpenBLAS")
set(ALL_LIB ${ALL_LIB} ${OPENBLAS_LIB})
endif()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File :" ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB})
endif()
message(STATUS "${MESS}")
else()
add_definitions(-std=c++11)
set(MESS ${MESS} "On Linux")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
set(FLAG ${FLAG} "-lpthread -lcudart -lnvidia-ml")
else()
set(FLAG ${FLAG} "-lpthread")
endif()
if(USE_MKL)
set(MESS ${MESS} " Use MKL")
set(ALL_LIB ${ALL_LIB} ${MKL_LIB})
set(FLAG ${FLAG} "-liomp5 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -ldl")
elseif(USE_OPENBLAS)
set(MESS ${MESS} " Use OpenBLAS")
set(ALL_LIB ${ALL_LIB} ${OPENBLAS_LIB})
set(FLAG ${FLAG} "-lopenblas")
endif()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB} ${FLAG})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File: " ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# NiuTrans.NMT
- [Features](#features)
- [Installation](#installation)
- [Requirements](#requirements)
- [Build from Source](#build-from-source)
- [Configure with CMake](#configure-with-cmake)
- [Configuration Example](#configuration-example)
- [Compile on Linux](#compile-on-linux)
- [Compile on Windows](#compile-on-windows)
- [Usage](#usage)
- [Training](#training)
- [Commands](#commands)
- [An Example](#an-example)
- [Translating](#translating)
- [Commands](#commands-1)
- [An Example](#an-example-1)
- [Low Precision Inference](#low-precision-inference)
- [Converting Models from Fairseq](#converting-models-from-fairseq)
- [A Model Zoo](#a-model-zoo)
- [Papers](#papers)
- [Team Members](#team-members)
## Features
NiuTrans.NMT is a lightweight and efficient Transformer-based neural machine translation system. Its main features are:
* Few dependencies. It is implemented with pure C++, and all dependencies are optional.
* Fast decoding. It supports various decoding acceleration strategies, such as batch pruning and dynamic batch size.
* Advanced NMT models, such as [Deep Transformer](https://www.aclweb.org/anthology/P19-1176).
* Flexible running modes. The system runs on various platforms and devices (Linux or Windows, CPUs or GPUs, FP32 or FP16, etc.).
* Framework agnostic. It supports various models trained with other tools, e.g., fairseq models.
* The code is simple and friendly to beginners.
## Installation
### Requirements
* OS: Linux or Windows
* [GCC/G++](https://gcc.gnu.org/) >=4.8.4 (on Linux)
* [VC++](https://www.microsoft.com/en-us/download/details.aspx?id=48145) >=2015 (on Windows)
* [CMake](https://cmake.org/download/) >= 2.8
* [CUDA](https://developer.nvidia.com/cuda-92-download-archive) >= 9.2, <= 10.0 (optional)
* [MKL](https://software.intel.com/content/www/us/en/develop/tools/math-kernel-library.html) latest version (optional)
* [OpenBLAS](https://github.com/xianyi/OpenBLAS) latest version (optional)
### Build from Source
#### Configure with CMake
The default configuration builds the **pure CPU** version.
```bash
# Download the code
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
# Merge with NiuTensor
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
# Run CMake
cmake ..
```
You can add compilation options to the CMake command to support accelerations with MKL, OpenBLAS, or CUDA.
*Please note that you can only select at most one of MKL or OpenBLAS.*
* Use CUDA (required for training)
Add ``-DUSE_CUDA=ON`` and ``-DCUDA_TOOLKIT_ROOT_DIR=$CUDA_PATH`` to the CMake command, where ``$CUDA_PATH`` is the path of the CUDA toolkit.
You can also add ``-DUSE_FP16=ON`` to the CMake command to enable half-precision support.
* Use MKL (optional)
Add ``-DUSE_MKL=ON`` and ``-DINTEL_ROOT=$MKL_PATH`` to the CMake command, where ``$MKL_PATH`` is the path of MKL.
* Use OpenBLAS (optional)
Add ``-DUSE_OPENBLAS=ON`` and ``-DOPENBLAS_ROOT=$OPENBLAS_PATH`` to the CMake command, where ``$OPENBLAS_PATH`` is the path of OpenBLAS.
*Note that half-precision requires GPUs with the Pascal architecture or newer.*
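For instance, a GPU build with half-precision enabled could be configured as follows (the CUDA path is only an example; point it at your own installation):
```bash
# configure a CUDA build with FP16 support (adjust the toolkit path to your setup)
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-9.2 -DUSE_FP16=ON ..
```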
#### Configuration Example
We provide [several examples](./sample/compile/README.md) to build the project with different options.
#### Compile on Linux
```bash
make -j && cd ..
```
#### Compile on Windows
Add ``-A x64`` to the CMake command and it will generate a Visual Studio project on Windows, i.e., ``NiuTrans.NMT.sln``, so you can open and build it with Visual Studio (>= Visual Studio 2015).
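For example, with Visual Studio 2019 the full command could look like this (the generator name is an assumption; pick the one matching your installed Visual Studio):
```bash
cmake -G "Visual Studio 16 2019" -A x64 ..
```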
If the build succeeds, you will get an executable file **`NiuTrans.NMT`** in the `bin` directory.
## Usage
### Training
#### Commands
*Make sure to compile the program with CUDA, because training on CPUs is not supported yet.*
Step 1: Prepare the training data.
```bash
# Convert the BPE vocabulary
python3 tools/GetVocab.py \
-raw $bpeVocab \
-new $niutransVocab
```
Description:
* `raw` - Path of the BPE vocabulary.
* `new` - Path of the NiuTrans.NMT vocabulary to be saved.
```bash
# Binarize the training data
python3 tools/PrepareParallelData.py \
-src $srcFile \
-tgt $tgtFile \
-src_vocab $srcVocab \
-tgt_vocab $tgtVocab \
-output $trainingFile
```
Description:
* `src` - Path of the source language data. One sentence per line with tokens separated by spaces or tabs.
* `tgt` - Path of the target language data. The same format as the source language data.
* `src_vocab` - Path of the source language vocabulary. Its first line is the vocabulary size and the first index, followed by a word and its index on each subsequent line (see the sketch after this list).
* `tgt_vocab` - Path of the target language vocabulary. The same format as the source language vocabulary.
* `output` - Path of the training data to be saved.
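As a sketch, a vocabulary file in the format described above could look like this (the size, starting index, and words are made up for illustration):
```
10000 4
the 4
of 5
and 6
```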
Step 2: Train the model. A minimal command is shown below; a fuller example with common options follows the parameter list.
```bash
bin/NiuTrans.NMT \
-dev $deviceID \
-model $modelFile \
-train $trainingData \
-valid $validData
```
Description:
* `dev` - Device id (>= 0 for GPUs). Default: 0.
* `model` - Path of the model to be saved.
* `train` - Path to the training file. The same format as the output file in step 1.
* `valid` - Path to the validation file. The same format as the output file in step 1.
* `wbatch` - Word batch size. Default: 4096.
* `sbatch` - Sentence batch size. Default: 8.
* `mt` - Indicates whether the model runs for machine translation. Default: true.
* `dropout` - Dropout rate for the model. Default: 0.3.
* `fnndrop` - Dropout rate for fnn layers. Default: 0.1.
* `attdrop` - Dropout rate for attention layers. Default: 0.1.
* `lrate`- Learning rate. Default: 0.0015.
* `lrbias` - The parameter that controls the maximum learning rate in training. Default: 0.
* `nepoch` - Training epoch number. Default: 50.
* `nstep` - Training step number. Default: 100000.
* `nwarmup` - Step number of warm-up for training. Default: 8000.
* `adam` - Indicates whether Adam is used. Default: true.
* `adambeta1` - Hyperparameter of Adam. Default: 0.9.
* `adambeta2` - Hyperparameter of Adam. Default: 0.98.
* `adambeta` - Hyperparameter of Adam. Default: 1e-9.
* `shuffled` - Indicates whether the data file is shuffled for training. Default: true.
* `labelsmoothing` - Label smoothing factor. Default: 0.1.
* `nstepcheckpoint` - Number of steps after which we make a checkpoint. Default: -1.
* `epochcheckpoint` - Indicates whether we make a checkpoint after each training epoch. Default: true.
* `updatestep` - Number of batches that we collect for model update. Default: 1 (one can set > 1 for gradient accumulation).
* `sorted` - Indicates whether the sequence is sorted by length. Default: false.
* `bufsize` - Buffer size for the batch loader. Default: 50000.
* `doubledend` - Indicates whether we double the </s> symbol for the output of LM. Default: false.
* `smallbatch` - Indicates whether we use batchsize = max * sc rather than batchsize = word-number, where max is the maximum length and sc is the sentence number. Default: true.
* `bigbatch` - Counterpart of `smallbatch`. Default: false.
* `randbatch` - Randomize batches. Default: false.
* `bucketsize` - Bucket size for the batch loader. Default: wbatch * 10.
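Putting some of these options together, a typical training command could look like the following (all file names are placeholders; the values shown are the defaults listed above, except `updatestep`, which is raised to illustrate gradient accumulation):
```bash
bin/NiuTrans.NMT \
    -dev 0 \
    -model model.bin \
    -train train.data \
    -valid valid.data \
    -wbatch 4096 -sbatch 8 \
    -lrate 0.0015 -nwarmup 8000 \
    -nepoch 50 -nstep 100000 \
    -labelsmoothing 0.1 \
    -updatestep 4
```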
#### An Example
Refer to [this page](./sample/train/) for a training example.
### Translating
*Make sure to compile the program with CUDA and FP16 if you want to translate with FP16 on GPUs.*
#### Commands
```bash
bin/NiuTrans.NMT \
-dev $deviceID \
-test $inputFile \
-model $modelPath \
-sbatch $batchSize \
-beamsize $beamSize \
-srcvocab $srcVocab \
-tgtvocab $tgtVocab \
-output $outputFile
```
Description:
* `model` - Path of the model.
* `sbatch` - Sentence batch size. Default: 8.
* `dev` - Device id (-1 for CPUs, and >= 0 for GPUs). Default: 0.
* `beamsize` - Size of the beam. Set it to 1 for greedy search.
* `test` - Path of the input file. One sentence per line with tokens separated by spaces.
* `output` - Path of the output file to be saved. The same format as the input file.
* `srcvocab` - Path of the source language vocabulary. Its first line is the vocabulary size, followed by a word and its index on each subsequent line.
* `tgtvocab` - Path of the target language vocabulary. The same format as the source language vocabulary.
* `fp16 (optional)` - Inference with FP16. This will not work if the model is stored in FP32. Default: false.
* `lenalpha` - The alpha parameter controls the length preference. Default: 0.6.
* `maxlenalpha` - Scalar of the input sequence (for the max number of search steps). Default: 1.2.
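For example, a beam-search run with FP16 and explicit length settings could look like this (all paths are placeholders, and `-fp16` only works if the model is stored in FP16):
```bash
bin/NiuTrans.NMT \
    -dev 0 \
    -fp16 \
    -test test.txt \
    -model model.fp16.bin \
    -sbatch 64 -beamsize 4 \
    -lenalpha 0.6 -maxlenalpha 1.2 \
    -srcvocab vocab.src \
    -tgtvocab vocab.tgt \
    -output output.txt
```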
#### An Example
Refer to [this page](./sample/translate/) for a translation example.
## Low Precision Inference
NiuTrans.NMT supports inference with FP16. You can convert a model to FP16 with our tools:
```bash
python3 tools/FormatConverter.py \
-input $inputModel \
-output $outputModel \
-format $targetFormat
```
Description:
* `input` - Path of the raw model file.
* `output` - Path of the new model file.
* `format` - Target storage format, FP16 (Default) or FP32.
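For instance, converting a model to FP16 could look like this (file names are placeholders; check the tool's help for the exact values accepted by `-format`):
```bash
python3 tools/FormatConverter.py \
    -input model.fp32.bin \
    -output model.fp16.bin \
    -format fp16
```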
## Converting Models from Fairseq
The core implementation is framework agnostic, so we can easily convert models trained with other frameworks to a binary format for efficient inference.
The following frameworks and models are currently supported:
| | [fairseq (0.6.2)](https://github.com/pytorch/fairseq/tree/v0.6.2) |
| --- | :---: |
| Transformer ([Vaswani et al. 2017](https://arxiv.org/abs/1706.03762)) | ✓ |
| RPR attention ([Shaw et al. 2018](https://arxiv.org/abs/1803.02155)) | ✓ |
| Deep Transformer ([Wang et al. 2019](https://www.aclweb.org/anthology/P19-1176/)) | ✓ |
*Refer to [this page](https://fairseq.readthedocs.io/en/latest/getting_started.html#training-a-new-model) for the details about training models with fairseq.*
After training, you can convert the fairseq models and vocabulary with the following steps.
Step 1: Convert parameters of a single fairseq model
```bash
python3 tools/ModelConverter.py -src $src -tgt $tgt
```
Description:
* `src` - Path of the fairseq checkpoint, [refer to this for more details](https://fairseq.readthedocs.io/en/latest/).
* `tgt` - Path to save the converted model parameters. All parameters are stored in a binary format.
* `fp16 (optional)` - Save the parameters with 16-bit data type. Default: disabled.
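As a sketch, converting a single checkpoint with FP16 storage could look like this (the checkpoint name is a placeholder, and the exact form of the `fp16` switch is an assumption based on the description above):
```bash
python3 tools/ModelConverter.py -src checkpoint_best.pt -tgt model.bin -fp16
```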
Step 2: Convert the vocabulary:
```bash
python3 tools/VocabConverter.py -src $fairseqVocabPath -tgt $newVocabPath
```
Description:
* `src` - Path of the fairseq vocabulary, [refer to this for more details](https://fairseq.readthedocs.io/en/latest/).
* `tgt` - Path to save the converted vocabulary. Its first line is the vocabulary size, followed by a word and its index on each subsequent line.
*You may need to convert both the source language vocabulary and the target language vocabulary if they are not shared.*
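For unshared vocabularies, that means running the converter once per side, e.g. (the fairseq dictionary names are typical defaults, not guaranteed):
```bash
python3 tools/VocabConverter.py -src dict.de.txt -tgt vocab.de
python3 tools/VocabConverter.py -src dict.en.txt -tgt vocab.en
```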
## A Model Zoo
We provide several pre-trained models to test the system.
All models and runnable systems are packaged into Docker images so that one can easily reproduce our results.
Refer to [this page](./sample/translate) for more details.
## Papers
Here are the papers related to this project:
[Learning Deep Transformer Models for Machine Translation.](https://www.aclweb.org/anthology/P19-1176) Qiang Wang, Bei Li, Tong Xiao, Jingbo Zhu, Changliang Li, Derek F. Wong, Lidia S. Chao. 2019. Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics.
[The NiuTrans System for WNGT 2020 Efficiency Task.](https://www.aclweb.org/anthology/2020.ngt-1.24) Chi Hu, Bei Li, Yinqiao Li, Ye Lin, Yanyang Li, Chenglong Wang, Tong Xiao, Jingbo Zhu. 2020. Proceedings of the Fourth Workshop on Neural Generation and Translation.
## Team Members
This project is maintained by a joint team from NiuTrans Research and NEU NLP Lab. Current team members are
*Chi Hu, Bei Li, Yinqiao Li, Ye Lin, Quan Du, Tong Xiao and Jingbo Zhu*
Please contact niutrans@mail.neu.edu.cn if you have any questions.
# Compilation Example
Here are some compilation examples for Linux with MKL, OpenBLAS, or CUDA support.
**Replace the paths with those in your environment.**
## Compile with CUDA support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR='/home/nlplab/cuda9.2/' ..
make -j
```
## Compile with CUDA and FP16 support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT_DIR='/home/nlplab/cuda9.2/' -DUSE_FP16=ON ..
make -j
```
## Compile with MKL support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_MKL=ON -DINTEL_ROOT='/home/nlplab/intel/compilers_and_libraries_2020.2.254/linux' ..
make -j
```
## Compile with OpenBLAS support
```bash
git clone https://github.com/NiuTrans/NiuTrans.NMT.git
git clone https://github.com/NiuTrans/NiuTensor.git
mv NiuTensor/source NiuTrans.NMT/source/niutensor
rm NiuTrans.NMT/source/niutensor/Main.cpp
rm -rf NiuTrans.NMT/source/niutensor/sample NiuTrans.NMT/source/niutensor/tensor/test
mkdir NiuTrans.NMT/build && cd NiuTrans.NMT/build
cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/home/nlplab/openblas/' ..
make -j
```
# Training a new model
## IWSLT'14 German to English (Transformer)
The following instructions can train a Transformer model on the [IWSLT'14 German to English dataset](http://workshop2014.iwslt.org/downloads/proceeding.pdf).
Step 1: Prepare the training data:
*We provide the BPE codes for better reproducibility. The source and target vocabularies are shared with 10,000 merges.*
```bash
# Extract the data
cd sample/train/
IWSLT_PATH=iwslt14.tokenized.de-en
tar -zxvf $IWSLT_PATH.tar.gz
IWSLT_PATH=sample/train/$IWSLT_PATH
# Binarize the data
cd ../..
python3 tools/GetVocab.py \
-raw $IWSLT_PATH/bpevocab \
-new $IWSLT_PATH/vocab.de
python3 tools/GetVocab.py \
-raw $IWSLT_PATH/bpevocab \
-new $IWSLT_PATH/vocab.en
python3 tools/PrepareParallelData.py \
-src $IWSLT_PATH/train.de -tgt $IWSLT_PATH/train.en \
-src_vocab $IWSLT_PATH/vocab.de -tgt_vocab $IWSLT_PATH/vocab.en \
-output $IWSLT_PATH/train.data
python3 tools/PrepareParallelData.py \
-src $IWSLT_PATH/valid.de -tgt $IWSLT_PATH/valid.en \
-src_vocab $IWSLT_PATH/vocab.de -tgt_vocab $IWSLT_PATH/vocab.en \
-output $IWSLT_PATH/valid.data
```
*You may extract the data manually on Windows.*
Step 2: Train the model with the default configurations
(6 encoder/decoder layers, model size 512, 50 epochs):
```bash
bin/NiuTrans.NMT \
-dev 0 \
-nepoch 50 \
-model model.bin \
-maxcheckpoint 10 \
-train $IWSLT_PATH/train.data \
-valid $IWSLT_PATH/valid.data
```
Step 3: Average the last ten checkpoints:
```bash
python tools/Ensemble.py -input 'model.bin.*' -output model.ensemble
```
Training takes about 310 s per epoch on a GTX 1080 Ti.
Expected BLEU score (lenalpha=0.6, maxlenalpha=1.2):
| Model type | Beam Search | Greedy Search |
| --------------- | --------------- | --------------- |
| Single model | 34.05 (beam=4) | 33.35 |
| Ensemble model | 34.48 (beam=4) | 34.01 |
We provide models trained with the default configurations:
[Google Drive](https://drive.google.com/drive/folders/10W89cx60Q7A9nGyg5fwLP21Sg53n6NXV?usp=sharing)
[Baidu Cloud](https://pan.baidu.com/s/1LbkV8kuaDWNunVR2jwOhRg) (password: bdwp)
# Translating with pre-trained models
## IWSLT'14 De-En Models
The following instructions can be used to translate with a pre-trained Transformer model.
You can evaluate models trained in the [training example](../sample/train) in two steps.
Step 1: Translate the IWSLT14 De-En test set (tokenized) on the GPU:
```bash
IWSLT_PATH=sample/train/iwslt14.tokenized.de-en
bin/NiuTrans.NMT \
-dev 0 \
-test $IWSLT_PATH/test.de \
-model model.bin \
-sbatch 64 \
-beamsize 1 \
-srcvocab $IWSLT_PATH/vocab.de \
-tgtvocab $IWSLT_PATH/vocab.en \
-output output.atat
sed -r 's/(@@ )|(@@ ?$)//g' < output.atat > output
```
You can also set `-dev -1` to use the CPU.
Step 2: Check the translation with [multi-bleu](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl):
```bash
perl multi-bleu.perl $IWSLT_PATH/test.en < output
```
It takes about 15 s to translate test.de (6,750 sentences) on a GTX 1080 Ti with greedy search.
## WNGT 2020 Models
The models here are the submissions to the [WNGT 2020 efficiency task](https://sites.google.com/view/wngt20/efficiency-task), which focuses on developing efficient MT systems.
The WNGT 2020 efficiency task constrains systems to translate 1 million sentences on CPUs and GPUs under the conditions of the [WMT 2019 English-German news](http://statmt.org/wmt19/translation-task.html) translation task.
- For CPUs, the performance was measured on an [AWS c5.metal instance](https://aws.amazon.com/cn/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) with 96 logical Cascade Lake processors and 192 GB memory. We submitted one system (9-1-tiny) running with all CPU cores.
- For GPUs, the performance was measured on an [AWS g4dn.xlarge instance](https://aws.amazon.com/cn/ec2/instance-types/g4/) with an NVIDIA T4 GPU and 16 GB memory. We submitted four systems (9-1, 18-1, 35-1, 35-6) running with FP16.
We list the results of all submissions. See [the official results](https://docs.google.com/spreadsheets/d/1M82S5wPSIM543Gh20d71Zs0FNHJQ3JdiJzDECiYJNlE/edit#gid=0) for more details.
| Model type | Time (s) | File size (MiB) | BLEU | Words per second |
| ---------- | -------- | --------------- | ---- | --------------- |
| 9-1-tiny* | 810 | 66.8 | 27.0 | 18518 |
| 9-1 | 977 | 99.3 | 31.1 | 15353 |
| 18-1 | 1355 | 156.1 | 31.4 | 11070 |
| 35-1 | 2023 | 263.3 | 32.0 | 7418 |
| 35-6 | 3166 | 305.4 | 32.2 | 4738 |
<em>* means run on CPUs. </em>
Description:
* `Model type` - Number of encoder and decoder layers, e.g., 9-1 means that the model consists of 9 encoder layers and 1 decoder layer. The model size is 512 except for the *tiny* model, whose size is 256.
* `Time` - Wall-clock time taken to translate the whole test set, which contains about 1 million sentences with ~15 million tokens. The time of the `tiny` model was measured on CPUs, while the other models were measured on GPUs.
* `File size` - All models are stored in FP16 except for the `tiny` model stored in FP32.
* `BLEU` - We report the average sacreBLEU score across wmt10 to wmt19, excluding wmt12. Signature: BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt10+tok.13a+version.1.4.9 (for wmt10; similar for the others).
All these models and docker images are available at:
[Baidu Cloud](https://pan.baidu.com/s/1J8kRoF3d5P-XA4Qd3YT4ZQ) password: bdwp
[Google Drive](https://drive.google.com/file/d/1tgCUN8TnUsbcI7BCYFQkj30rCvk68YRb) (docker images only)
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include "./nmt/NMT.h"
#include "niutensor/network/XNoder.h"
#include "niutensor/tensor/XTensor.h"
#include "niutensor/tensor/core/movement/Spread.h"
using namespace nmt;
using namespace nts;
void test() {
XTensor input, node, index;
InitTensor2D(&input, 32, 4);
/* note: the original code initialized `input` three times and left
   `node` and `index` uninitialized, which looks like a typo */
InitTensor2D(&node, 13, 4);
InitTensor2D(&index, 32, 4);
XNoder::MakeGrad(&input);
XNoder::MakeGrad(&node);
XTensor* tmp = NewTensorBufV2(&input, input.devID, input.mem);
_SpreadForGather(tmp, node.grad, &index);
_SumMe(input.grad, tmp);
input.grad->Dump(stderr);
}
int main(int argc, const char** argv)
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
NMTMain(argc - 1, argv + 1);
//test();
//_CrtDumpMemoryLeaks();
return 0;
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Decoder.h"
#include "Utility.h"
#include "layer/LayerNorm.h"
#include "layer/CommonModules.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
AttDecoder::AttDecoder()
{
selfAtt = NULL;
fnns = NULL;
selfAttLayerNorms = NULL;
fnnLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
selfAttCache = NULL;
enDeAttCache = NULL;
}
/* destructor */
AttDecoder::~AttDecoder()
{
delete[] selfAttCache;
delete[] enDeAttCache;
delete[] selfAtt;
delete[] fnns;
delete[] selfAttLayerNorms;
delete[] fnnLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
if (preNorm)
delete decoderLayerNorm;
}
/*
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nDecLayer;
hSize = config.modelSize;
eSize = config.embSize;
vSize = config.tgtVocabSize;
dropoutP = config.dropout;
preNorm = config.preNorm;
CheckNTErrors(nlayer >= 1, "There must be at least one decoder layer!");
CheckNTErrors(vSize > 1, "Set the target vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(config, false);
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
selfAttLayerNorms = new LN[nlayer];
enDeAtt = new Attention[nlayer];
enDeAttLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
selfAttLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
selfAttCache[i].enable = true;
enDeAttCache[i].enable = true;
}
if (preNorm)
decoderLayerNorm->InitModel(config);
}
/*
make the decoding network
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor ende;
XTensor fnn;
XTensor res;
XTensor selfAttnBefore;
XTensor selfAttnAfter;
XTensor endeAttnBefore;
XTensor endeAttnAfter;
XTensor fnnBefore;
/* layer normalization with pre-norm for self-attn */
selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);
/******************/
/* self attention */
att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore,
mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attention */
selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for encoder-decoder attention */
endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);
/* encoder-decoder attention */
ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
res = Sum(ende, selfAttnAfter);
/* layer normalization with post-norm for encoder-decoder attention */
endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
fnn = fnns[i].Make(fnnBefore, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, endeAttnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm variant)
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for encoder-decoder attention */
x = enDeAttLayerNorms[i].Make(x);
/* encoder-decoder attention */
x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
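/* final layer normalization; MakeFast assumes the pre-norm setting, in which decoderLayerNorm was allocated by InitModel */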
x = decoderLayerNorm->Make(x);
return x;
}
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __DECODER_H__
#define __DECODER_H__
#include "Encoder.h"
#include "Utility.h"
namespace nmt
{
class AttDecoder
{
public:
/* device id */
int devID;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* embedding of word at each position */
Embedder embedder;
/* FNN model of each layer */
FNN* fnns;
/* attention model of each layer */
Attention* selfAtt;
/* layer normalization for attention */
LN* selfAttLayerNorms;
/* layer normalization for fnn */
LN* fnnLayerNorms;
/* layer normalization for decoder */
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
/* layer cache list */
Cache* enDeAttCache;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttDecoder();
/* destructor */
~AttDecoder();
/* initialize the model */
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre-norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Encoder.h"
#include "Utility.h"
#include "layer/LayerNorm.h"
#include "layer/CommonModules.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
AttEncoder::AttEncoder()
{
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
encoderLayerNorm = NULL;
}
/* destructor */
AttEncoder::~AttEncoder()
{
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
if (preNorm)
delete encoderLayerNorm;
}
/*
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nEncLayer;
eSize = config.embSize;
hSize = config.modelSize;
vSize = config.srcVocabSize;
preNorm = config.preNorm;
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "There must be at least one encoder layer!");
CheckNTErrors(vSize > 1, "Set the source vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(config);
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
attLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
if (preNorm)
encoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
selfAtt[i].InitModel(config);
fnns[i].InitModel(config);
attLayerNorms[i].InitModel(config);
fnnLayerNorms[i].InitModel(config);
}
if (preNorm)
encoderLayerNorm->InitModel(config);
}
/*
make the encoding network
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor att;
XTensor fnn;
XTensor res;
XTensor attnBefore;
XTensor attnAfter;
XTensor fnnBefore;
/* layer normalization with pre-norm for self-attn */
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
res = Sum(att, x);
/* layer normalization with post-norm for self-attn */
attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
/* layer normalization with pre-norm for fnn */
fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);
/* fnn */
fnn = fnns[i].Make(fnnBefore, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
fnn = Dropout(fnn, dropoutP);
/* residual connection */
res = Sum(fnn, attnAfter);
/* layer normalization with post-norm for fnn */
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm variant)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> maskEncDec - not used
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
XTensor x;
x = embedder.Make(input, false, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for (int i = 0; i < nlayer; i++) {
XTensor res;
res = x;
/* layer normalization with pre-norm for self-attn */
x = attLayerNorms[i].Make(x);
/* self attention */
x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
res = x;
/* layer normalization with pre-norm for fnn */
x = fnnLayerNorms[i].Make(x);
/* fnn */
x = fnns[i].Make(x, isTraining);
/* dropout */
if (isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
/* residual connection */
x = Sum(res, x);
}
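/* final layer normalization; MakeFast assumes the pre-norm setting, in which encoderLayerNorm was allocated by InitModel */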
x = encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (wrapper)
>> input - the input tensor of the encoder
>> mask - the mask that indicates which positions are valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return Make(input, mask, nothing, isTraining);
}
}
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "Utility.h"
#include "layer/FNN.h"
#include "layer/Attention.h"
#include "layer/Embedding.h"
#include "layer/LayerNorm.h"
#include "../niutensor/network/XNet.h"
using namespace nts;
namespace nmt
{
/*
base class of the encoder
*/
class Encoder
{
public:
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
};
/*
the encoder based on self-attention
*/
class AttEncoder : Encoder
{
public:
/* device id */
int devID;
/* layer number */
int nlayer;
/* hidden layer size of the FNN layer */
int hSize;
/* embedding size */
int eSize;
/* vocabulary size */
int vSize;
/* dropout probability */
DTYPE dropoutP;
/* some positions can be ignored in attention. This is useful in LM where the first position needs
a special design for the attention model. */
int ignored;
/* embedding of word at each position */
Embedder embedder;
/* FNN model of each layer */
FNN* fnns;
/* attention model of each layer */
Attention* selfAtt;
/* layer normalizations for attention */
LN* attLayerNorms;
/* layer normalization for fnn */
LN* fnnLayerNorms;
/* layer normalization for encoder */
LN* encoderLayerNorm;
/* the location of layer normalization */
bool preNorm;
public:
/* constructor */
AttEncoder();
/* de-constructor */
~AttEncoder();
/* initialize the model */
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (fast path with pre-norm) */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cstdint>
#include "Model.h"
#include "Utility.h"
#include "../niutensor/tensor/XUtility.h"
#include "../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Model::Model()
{
devID = -1;
isLM = false;
isMT = false;
useFP16 = false;
shareAllEmbeddings = 0;
shareDecInputOutputWeight = 0;
nhead = 1;
encoder = new AttEncoder();
decoder = new AttDecoder();
outputLayer = new Output();
}
/* de-constructor */
Model::~Model()
{
delete encoder;
delete decoder;
delete outputLayer;
}
/*
initialize the model
>> config - configurations of the model
*/
void Model::InitModel(Config& config)
{
devID = config.devID;
isMT = config.isMT;
isLM = !isMT;
useFP16 = config.useFP16;
/* configurations for the model */
int* metaInfo[] = {
&config.nEncLayer, &config.nDecLayer,
&config.fnnHiddenSize, &config.modelSize,
&config.embSize, &config.srcVocabSize,
&config.tgtVocabSize, &config.nhead,
&config.maxRP, &config.shareAllEmbeddings,
&config.shareDecInputOutputWeight,
&config.maxPosLen
};
FILE* modelFile = NULL;
/* read model configurations */
if (!config.isTraining) {
modelFile = fopen(config.modelFN, "rb");
CheckNTErrors(modelFile, "Failed to open the model file");
for (auto& meta : metaInfo) {
fread(meta, sizeof(int), 1, modelFile);
}
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
encoder->InitModel(config);
outputLayer->InitModel(config);
if (isMT)
decoder->InitModel(config);
/* load parameters */
if (!config.isTraining)
Read(modelFile);
else {
TensorList params;
GetParams(params);
for (int i = 0; i < params.Size(); i++)
params[i]->SetVarFlag();
}
if (modelFile != NULL)
fclose(modelFile);
}
/*
print model configurations
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
/* TODO: output more info */
XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
return encoder->Make(input, mask, nothing, isTraining);
}
/*
make the decoding network
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
inputDec.GetDim(1), isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
int len = padding.GetDim(padding.order - 1);
int* dims = new int[padding.order + 2];
for (int i = 0; i < padding.order; i++)
dims[i + 1] = padding.GetDim(i);
dims[0] = nhead;
dims[padding.order + 1] = len;
XTensor mask;
InitTensor(&mask, padding.order + 2, dims, X_FLOAT, padding.devID);
delete[] dims;
/* an upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores prevents attention to the following words in a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
ScaleAndShiftMe(mask, 1.0F, -1e9F);
/* forward */
XTensor encoding;
encoding = MakeEncoder(input, &mask, isTraining);
outputLayer->Make(encoding, output, true, true);
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* encoder mask */
MakeMTMaskEnc(paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output, true, true);
}
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskEnc - mask of the encoder self-attention
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* an upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores prevents attention to the following words in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the mask of the encoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
XTensor padding2;
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
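/* Worked example (illustrative): for one sentence with paddingEnc = [1, 1, 1, 0, 0],
   Unsqueeze + ScaleAndShiftMe(1e9, -1e9) maps each padding value p to p * 1e9 - 1e9:
       valid position   (p = 1) ->    0
       padding position (p = 0) -> -1e9
   so every row of the (nhead, batchSize, srcLen, srcLen) mask reads
       [0, 0, 0, -1e9, -1e9]
   and the softmax over the attention scores assigns ~0 weight to padding. */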
/*
make the mask of the decoder
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maskDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
{
int len = paddingDec.GetDim(paddingDec.order - 1);
int* dims = new int[paddingDec.order + 2];
for (int i = 0; i < paddingDec.order; i++)
dims[i + 1] = paddingDec.GetDim(i);
dims[0] = nhead;
dims[paddingDec.order + 1] = len;
InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix whose upper-triangular cells are set to -1e9
(the lower triangle, including the diagonal, is 0). Adding it to the attention
scores blocks attention to the following words in a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
ScaleAndShiftMe(maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
XTensor maskEncDecTMP;
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
delete[] dims;
}
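/* Worked example (illustrative): for tgtLen = 4, _SetDataLowTri(1e9, 0) followed by
   ScaleAndShiftMe(1.0, -1e9) yields the causal mask
       [   0, -1e9, -1e9, -1e9 ]
       [   0,    0, -1e9, -1e9 ]
       [   0,    0,    0, -1e9 ]
       [   0,    0,    0,    0 ]
   which, added to the attention scores, blocks attention to future positions
   while keeping the current and previous positions visible. */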
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void Model::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->selfAtt[i].weightQ);
list.Add(&encoder->selfAtt[i].weightK);
list.Add(&encoder->selfAtt[i].weightV);
list.Add(&encoder->selfAtt[i].biasQ);
list.Add(&encoder->selfAtt[i].biasK);
list.Add(&encoder->selfAtt[i].biasV);
if (encoder->selfAtt[i].useRPR)
list.Add(&encoder->selfAtt[i].RPEmbK);
list.Add(&encoder->selfAtt[i].weightO);
list.Add(&encoder->selfAtt[i].biasO);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
list.Add(&encoder->fnns[i].b2);
list.Add(&encoder->attLayerNorms[i].w);
list.Add(&encoder->attLayerNorms[i].b);
list.Add(&encoder->fnnLayerNorms[i].w);
list.Add(&encoder->fnnLayerNorms[i].b);
}
if (encoder->preNorm) {
list.Add(&encoder->encoderLayerNorm->w);
list.Add(&encoder->encoderLayerNorm->b);
}
if (isMT) {
/* decoder parameters */
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].weightQ);
list.Add(&decoder->selfAtt[i].weightK);
list.Add(&decoder->selfAtt[i].weightV);
list.Add(&decoder->selfAtt[i].biasQ);
list.Add(&decoder->selfAtt[i].biasK);
list.Add(&decoder->selfAtt[i].biasV);
if (decoder->selfAtt[i].useRPR)
list.Add(&decoder->selfAtt[i].RPEmbK);
list.Add(&decoder->selfAtt[i].weightO);
list.Add(&decoder->selfAtt[i].biasO);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].weightQ);
list.Add(&decoder->enDeAtt[i].weightK);
list.Add(&decoder->enDeAtt[i].weightV);
list.Add(&decoder->enDeAtt[i].biasQ);
list.Add(&decoder->enDeAtt[i].biasK);
list.Add(&decoder->enDeAtt[i].biasV);
list.Add(&decoder->enDeAtt[i].weightO);
list.Add(&decoder->enDeAtt[i].biasO);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
list.Add(&decoder->fnns[i].b1);
list.Add(&decoder->fnns[i].w2);
list.Add(&decoder->fnns[i].b2);
list.Add(&decoder->fnnLayerNorms[i].w);
list.Add(&decoder->fnnLayerNorms[i].b);
}
if (decoder->preNorm) {
list.Add(&decoder->decoderLayerNorm->w);
list.Add(&decoder->decoderLayerNorm->b);
}
}
list.Add(&encoder->embedder.w);
if (isMT && (shareAllEmbeddings == 0)) {
list.Add(&decoder->embedder.w);
}
if (shareDecInputOutputWeight == 0) {
list.Add(&outputLayer->w);
}
}
/*
dump the model to a file
>> fn - path to the file where the model is saved
*/
void Model::Dump(const char* fn)
{
double startT = GetClockSec();
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params;
GetParams(params);
int metaInfo[]{
encoder->nlayer, decoder->nlayer,
encoder->fnns->hSize, encoder->selfAtt->d,
encoder->embedder.eSize, encoder->embedder.vSize,
decoder->embedder.vSize, encoder->selfAtt->nhead,
encoder->selfAtt->maxRP, shareAllEmbeddings,
shareDecInputOutputWeight, encoder->embedder.maxLength - 1 - 1, /* restore the configured maxPosLen */
};
/* part 1: hyper-parameters */
fwrite(metaInfo, sizeof(int), sizeof(metaInfo) / sizeof(int), file);
/* part 2: model parameters */
for (int i = 0; i < params.Size(); i++) {
params[i]->BinaryDump(file);
}
fclose(file);
double elapsed = GetClockSec() - startT;
LOG("model saved (took %.1fs)", elapsed);
}
/* read the parameters */
void Model::Read(FILE* file)
{
double startT = GetClockSec();
TensorList params;
GetParams(params);
LOG("params count: %d", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 before reading files */
if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i];
InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
}
auto& encEmb = encoder->embedder.posEmbeddingBase;
auto& decEmb = decoder->embedder.posEmbeddingBase;
encEmb = ConvertDataType(encEmb, X_FLOAT16);
decEmb = ConvertDataType(decEmb, X_FLOAT16);
}
for (int i = 0; i < params.Size(); i++)
params[i]->BinaryRead(file);
/* share all embeddings */
if (shareAllEmbeddings == 1) {
_CopyValues(&encoder->embedder.w, &decoder->embedder.w);
LOG("sharing encoder decoder embeddings");
}
/* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) {
_CopyValues(&decoder->embedder.w, &outputLayer->w);
LOG("sharing decoder embeddings with output weights");
}
double elapsed = GetClockSec() - startT;
LOG("model loaded (took %.1fs)", elapsed);
}
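/* The model file layout, as produced by Dump and consumed by Read above:
   part 1: 12 int32 hyper-parameters, in the order used in Dump/InitModel
           (nEncLayer, nDecLayer, fnnHiddenSize, modelSize, embSize,
            srcVocabSize, tgtVocabSize, nhead, maxRP, shareAllEmbeddings,
            shareDecInputOutputWeight, maxPosLen);
   part 2: the raw parameter tensors, dumped one after another in the exact
           order returned by GetParams. */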
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __MODEL_H__
#define __MODEL_H__
#include "Encoder.h"
#include "Decoder.h"
#include "layer/FNN.h"
#include "layer/Output.h"
#include "Utility.h"
#include "layer/Attention.h"
namespace nmt
{
/* an NMT model that keeps the parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
int devID;
/* the encoder */
AttEncoder* encoder;
/* the decoder */
AttDecoder* decoder;
/* output layer */
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */
int nhead;
/* indicates whether the encoder and decoder share the embedding matrix */
int shareAllEmbeddings;
/* indicates whether the decoder embeddings are shared with the output weights */
int shareDecInputOutputWeight;
public:
/* constructor */
Model();
/* de-constructor */
~Model();
/* initialize the model */
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the decoding network */
XTensor MakeDecoder(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor& maskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrices */
void GetParams(TensorList& list);
/* dump the model to a file */
void Dump(const char* fn);
/* read the parameters */
void Read(FILE* file);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <ctime>
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace nmt
{
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
Config config(argc, argv);
srand(1);
/* training */
if (strcmp(config.trainFN, "") != 0) {
Model model;
model.InitModel(config);
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
Model model;
model.InitModel(config);
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
}
return 0;
}
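/* Example invocations (illustrative; option names follow Config, file names
   are placeholders):
   training:    NiuTrans.NMT -dev 0 -train train.data -valid valid.data -model model.bin
   translating: NiuTrans.NMT -dev 0 -model model.bin -test test.txt -output result.txt
                             -srcvocab vocab.src -tgtvocab vocab.tgt
   options may also be placed in a file and loaded via -config (see Config). */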
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the NMT system.
*/
#ifndef __NMT_H__
#define __NMT_H__
namespace nmt
{
/* entry point of the program */
int NMTMain(int argc, const char** argv);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "Utility.h"
#include "../niutensor/tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace nmt
{
/*
load configurations from the command line
>> argc - number of arguments
>> argv - the list of arguments
*/
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
char* configFN = new char[1024];
LoadParamString(argc, args, "config", configFN, "");
int argsNum = argc;
/* load configurations from a file */
if (strcmp(configFN, "") != 0)
argsNum = LoadFromFile(configFN, args);
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameter types to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
/* options for training */
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = strcmp(trainFN, "") != 0;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
LoadParamFloat(argsNum, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0.0F);
LoadParamInt(argsNum, args, "nepoch", &nepoch, 50);
LoadParamInt(argsNum, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argsNum, args, "adam", &useAdam, true);
LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1F);
LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argsNum, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
delete[] args;
delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
int argsNum = 0;
/* parse arguments */
string key, value;
while (f >> key >> value) {
/* prepend '-' so that the key matches the command-line option format */
key = '-' + key;
strcpy(args[argsNum++], key.c_str());
strcpy(args[argsNum++], value.c_str());
}
/* record the number of arguments */
return argsNum;
}
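/* An example configuration file (illustrative): one "key value" pair per
   line, using the same option names as the command line but without the
   leading '-':
       dev 0
       nhead 8
       enclayer 6
       declayer 6
       model model.bin
   LoadFromFile turns each key into "-key" so that the LoadParam* helpers
   below can match it. */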
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split string by delimiter, this will return indices of all sub-strings
>> s - the original string
>> delimiter - the delimiter string
<< indices - indices of all sub-strings
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
UInt64List indices;
if (delimiter.length() == 0) {
/* no delimiter: the whole string is a single sub-string */
indices.Add(0);
return indices;
}
size_t pos = 0;
uint64_t start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
indices.Add(start);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
indices.Add(start);
}
return indices;
}
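/* Usage example (illustrative): SplitToPos("a,b,,c", ",") returns the start
   indices {0, 2, 5} of the non-empty sub-strings "a", "b" and "c"; empty
   fields produced by consecutive delimiters are skipped. */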
/* split a string into an integer list */
IntList SplitInt(const string& s, const string& delimiter)
{
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
/* split a string to a float list */
FloatList SplitFloat(const string& s, const string& delimiter)
{
FloatList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtof(s.data() + indices[i], nullptr));
}
return values;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../niutensor/tensor/XList.h"
using namespace std;
using namespace nts;
namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations of the model, training and translation */
class Config {
public:
/* path to the model */
char modelFN[1024];
/* path to the source vocab */
char srcVocabFN[1024];
/* path to the target vocab */
char tgtVocabFN[1024];
/* path to the input file (for inference) */
char testFN[1024];
/* path to the output file (for inference) */
char outputFN[1024];
/* path to the training file */
char trainFN[1024];
/* path to the validation file */
char validFN[1024];
/* device id */
int devID;
/* beam size */
int beamSize;
/* word batch size */
int wBatchSize;
/* sentence batch size */
int sBatchSize;
/* number of heads in attention */
int nhead;
/* number of encoder layers */
int nEncLayer;
/* number of decoder layers */
int nDecLayer;
/* the maximum relative position in RPR attentions */
int maxRP;
/* the dimension of embeddings */
int embSize;
/* the dimension of hidden layer */
int modelSize;
/* the maximum length in positional embedding */
int maxPosLen;
/* the dimension of fnn hidden layer */
int fnnHiddenSize;
/* the vocab size of source sequence */
int srcVocabSize;
/* the vocab size of target sequence */
int tgtVocabSize;
/* the padding id */
int padID;
/* start symbol */
int startID;
/* end symbol */
int endID;
/* indicates whether the model uses pre-norm */
bool preNorm;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* indicates whether we use the RPR attention */
bool useRPR;
/* indicates whether we train the model */
bool isTraining;
/* dropout rate for the model */
float dropout;
/* dropout rate for fnn layers */
float fnnDropout;
/* dropout rate for attention layers */
float attDropout;
/* the alpha parameter controls the length preference */
float lenAlpha;
/* scalar of the input sequence (for max number of search steps) */
float maxLenAlpha;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
/* step number of warm-up for training */
int nwarmup;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
float labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* buffer size */
int bufSize;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* load configurations from the command line */
Config(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Attention.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Attention::Attention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
}
/* de-constructor */
Attention::~Attention()
{
}
/*
initialize the model
>> config - the configurations of the network
*/
void Attention::InitModel(Config& config)
{
devID = config.devID;
useRPR = config.useRPR;
nhead = config.nhead;
d = config.modelSize;
dk = config.modelSize;
dv = config.modelSize;
maxRP = config.maxRP;
dropoutP = config.attDropout;
/* initialize the parameters */
InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&weightO, scale);
if (useRPR)
_SetDataFanInOut(&RPEmbK, scale);
biasQ.SetZeroAll();
biasO.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
}
/*
make the network
>> k - keys, B * L * H for encoders, B * 1 * H for decoders
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries, B * L * H
>> v - values, B * L * H for encoders, B * 1 * H for decoders
>> mask - the attention mask added to the attention scores (optional)
>> isTraining - indicates whether the model is used for training
>> cache - decoder cache
>> attType - the attention type, e.g., self-attention or encoder-decoder attention
<< return - the multi-head attention result
*/
XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int attType)
{
const bool isEnc = (cache == NULL);
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers, or for training where the cache is not used */
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
else {
if (attType == SELF_ATT) {
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
/* on a cache hit, we only concatenate the cached keys/values with the new token */
if (!cache->miss) {
k2 = Concatenate(cache->key, k2, 1);
v2 = Concatenate(cache->value, v2, 1);
}
cache->key = k2;
cache->value = v2;
cache->miss = false;
if (useRPR)
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
else if (attType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
CheckNTErrors(0, "invalid cache type");
}
}
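/* A sketch of the cache behavior during incremental decoding (illustrative):
   - self-attention: the new token's key/value are appended to the cached
     ones (Concatenate along the length dimension), so step t attends over
     all previous positions without recomputing them;
   - encoder-decoder attention: the keys/values come from the encoder output,
     are computed once on the first cache miss, and are re-used afterwards. */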
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - the attention mask (optional)
>> isTraining - indicates whether the model is used for training
*/
XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* Some operations may cause numerical overflow under FP16 including
BMMul, Mask, Div and Softmax. So we need to cast the input to FP32 */
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
}
/* att = softmax(Q * K^T / sqrt(dk/nhead)) * V, where dk/nhead is the per-head size */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask)
dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
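/* Shape walkthrough (illustrative, with batchSize B = 2, length L = 5,
   hidden size H = 512 and nhead = 8, so 64 dimensions per head):
       k, q, v                 : (2, 5, 512)
       Split by head           : (8, 2, 5, 64)
       dot = Q * K^T           : (8, 2, 5, 5), scaled by 1 / sqrt(64)
       scalar = softmax(dot)   : (8, 2, 5, 5)
       att = scalar * V        : (8, 2, 5, 64)
       Merge + output proj     : (2, 5, 512) */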
/*
make the attention network by incorporating the relative position representation
with the given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - the attention mask (optional)
>> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder
*/
XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix);
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = (float)sqrt((float)(d / nhead));
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask)
dot = dot + *mask;
/* softmax */
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
generate relative position embeddings
>> lenQ - the length of query
>> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position
*/
XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
{
XTensor range;
XTensor embMatrix;
InitTensor1D(&range, lenKV, X_INT, devID);
int* index = new int[lenKV];
if (isEnc) {
for (int i = 0; i < lenKV; i++)
index[i] = i;
range.SetData(index, lenKV);
XTensor range2D;
XTensor range2DTrans;
range2D = Unsqueeze(range, 0, lenQ);
range2DTrans = Transpose(range2D, 0, 1);
embMatrix = Sum(range2D, range2DTrans, -1);
}
else {
for (int i = 0; i < lenKV; i++)
index[i] = -lenKV + i + 1;
range.SetData(index, lenKV);
embMatrix = Unsqueeze(range, 0, lenQ);
}
embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index;
return embMatrix;
}
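/* Worked example (illustrative): for an encoder with lenQ = lenKV = 3 and
   maxRelativeLen = 2, the relative position j - i is clipped to [-2, 2] and
   shifted by +2, giving the index matrix
       [ 2, 3, 4 ]
       [ 1, 2, 3 ]
       [ 0, 1, 2 ]
   where each entry selects a row of RPEmbK (which has 2 * maxRP + 1 = 5 rows). */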
/*
relative position-aware dot-product attention inner calculation
>> x - queries, (nhead, batchSize, lenQ, dimPerHead)
>> y - keys or values, (nhead, batchSize, lenKV, dimPerHead)
>> z - the relative position embeddings, (lenQ, lenKV, dimPerHead)
>> isKey - whether y is the key tensor
<< return - attention scores (if isKey) or context (otherwise),
(nhead, batchSize, lenQ, lenKV or dimPerHead)
*/
XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{
const int headNum = nhead;
const int batchSize = x.GetDim(1);
const int lenQ = x.GetDim(2);
const int lenKV = y.GetDim(2);
const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth;
auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
XTensor context;
context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans;
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
}
/* constructor */
Cache::Cache()
{
miss = true;
enable = true;
}
/* update the states cache */
void Cache::Update(XTensor&& k, XTensor&& v)
{
key = k;
value = v;
miss = false;
}
/* keep alive states */
void Cache::KeepAlive(XTensor& aliveIdx)
{
if (!miss) {
key = AutoGather(key, aliveIdx);
value = AutoGather(value, aliveIdx);
}
}
/* reorder alive states */
void Cache::Reorder(XTensor& reorder)
{
if (!miss) {
key = AutoGather(key, reorder);
value = AutoGather(value, reorder);
}
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "NNUtil.h"
#include "../Utility.h"
#include "../../niutensor/network/XNet.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* layer cache for keys and values */
class Cache
{
public:
/* cache for keys, (B, L, H) */
XTensor key;
/* cache for values, (B, L, H) */
XTensor value;
public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
/* update the states cache */
void Update(XTensor&& k, XTensor&& v);
/* keep alive states */
void KeepAlive(XTensor& aliveIdx);
/* reorder alive states */
void Reorder(XTensor& reorder);
};
/* multi-head attention */
class Attention
{
public:
/* device id */
int devID;
/* head number */
int nhead;
/* transformation matrix for Q */
XTensor weightQ;
/* bias for Q */
XTensor biasQ;
/* transformation matrix for K */
XTensor weightK;
/* bias for K */
XTensor biasK;
/* transformation matrix for V */
XTensor weightV;
/* bias for V */
XTensor biasV;
/* fused transformation matrix for Q, K and V (reserved; unused in this file) */
XTensor wBig;
/* fused bias for Q, K and V (reserved; unused in this file) */
XTensor bBig;
/* key embeddings for the relative position representation (RPR) */
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor weightO;
/* bias after dot-product attention */
XTensor biasO;
/* size of transformed Q and K */
int dk;
/* size of transformed V */
int dv;
/* size of input Q, K and V */
int d;
/* indicates whether we use the RPR attention */
bool useRPR;
/* dropout probability */
DTYPE dropoutP;
/* the maximum relative window size */
int maxRP;
public:
/* constructor */
Attention();
/* de-constructor */
~Attention();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining,
Cache* cache, int attType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining);
/* make the attention network with relative position representations (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include "CommonModules.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
else
return input;
}
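/* Truth table of (after ^ prenorm) as used above (illustrative):
       prenorm = true,  after = false (a "before" call) -> apply ln
       prenorm = true,  after = true  (an "after" call)  -> identity
       prenorm = false, after = false (a "before" call) -> identity
       prenorm = false, after = true  (an "after" call)  -> apply ln
   i.e. pre-norm models normalize before the sub-layer and post-norm models
   after it; the "before" flag is implied by the callers as !after. */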
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "LayerNorm.h"
#include "CommonModules.h"
using namespace nts;
namespace nmt
{
/* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Embedder::Embedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* de-constructor */
Embedder::~Embedder()
{
}
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void Embedder::InitModel(Config& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
padIdx = config.padID;
eSize = config.embSize;
maxLength = config.maxPosLen;
vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
/* reserve two extra rows: the position indices are shifted by padIdx + 1 in Make */
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F / (float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void Embedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float* data = new float[posEmbeddingBase.unitNum];
for (int pos = 0; pos < length; pos++) {
float* dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* set the embedding of the padding position to zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
if (w.dataType != posEmbeddingBase.dataType)
posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);
delete[] data;
}
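/* The loop above implements the sinusoidal positional encoding:
       PE(pos, i)               = sin(pos / 10000^(i / (channelSize - 1)))
       PE(pos, channelSize + i) = cos(pos / 10000^(i / (channelSize - 1)))
   with channelSize = eSize / 2, i.e. all sine channels come first, followed
   by all cosine channels (rather than interleaving them). */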
/*
make the network
>> input - the word indices
>> nstep - the length of current sequence
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* sanity checks on the input; the padding index is assumed to be 1 by default */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
position.Range(0, position.unitNum, 1);
/* offset the positions by padIdx + 1; these index operations need no gradient */
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings during decoding */
position.SetDataFixed(nstep + padIdx + 1);
}
/* we make positional embeddings first */
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
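/* Shape walkthrough (illustrative): for input word indices of shape (B, L),
       wordEmbedding = Gather(w, input)             : (B, L, eSize)
       posEmbedding  = Unsqueeze(Gather(...), 0, B) : (B, L, eSize)
   the word embeddings are scaled by sqrt(eSize) before the two are summed,
   the usual Transformer convention for balancing the two terms. */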
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../Utility.h"
#include "../../niutensor/network/XNet.h"
using namespace nts;
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
/*
embedding (of word at position i):
word embedding + positional embedding
*/
class Embedder
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
int padIdx;
/* word embedding matrix */
XTensor w;
/* pre-computed positional embeddings. Caching them speeds up
   the embedding step since they need not be re-computed for every batch */
XTensor posEmbeddingBase;
public:
/* constructor */
Embedder();
/* de-constructor */
~Embedder();
/* initialize the model */
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int length);
/* make the network */
XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/* constructor */
FNN::FNN()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* de-constructor */
FNN::~FNN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void FNN::InitModel(Config& config)
{
devID = config.devID;
inSize = config.modelSize;
outSize = config.modelSize;
hSize = config.fnnHiddenSize;
dropoutP = config.fnnDropout;
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
/* Xavier-uniform initialization: U(-sqrt(6/fanIn), +sqrt(6/fanIn)) */
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
}
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MulAndShift(input, w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MulAndShift(t1, w2, b2);
}
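/* Usage sketch (illustrative; `config` and `x` are assumed to exist):

   FNN ffn;
   ffn.InitModel(config);
   XTensor y = ffn.Make(x, true);

   For an input of shape (batch, length, inSize), the hidden layer has size
   hSize and the result has shape (batch, length, outSize); dropout is only
   applied because isTraining is true. */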
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __FNN_H__
#define __FNN_H__
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
using namespace nts;
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class FNN
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
/* dropout probability */
DTYPE dropoutP;
public:
/* constructor */
FNN();
/* de-constructor */
~FNN();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input, bool isTraining);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
namespace nmt
{
/* constructor */
GLU::GLU()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* de-constructor */
GLU::~GLU()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(Config& config)
{
devID = config.devID;
inSize = config.modelSize;
outSize = config.modelSize;
/* the input is split into two halves of dimension hSize (see Make);
   hSize was never set in the original code, so we assume it equals
   the model size here */
hSize = config.modelSize;
InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b1, outSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (W1 * x1 + b1) * sigmoid(W2 * x2 + b2), where x = [x1; x2]
>> input - the input tensor, whose last dimension is 2 * hSize
>> return - the output tensor, whose last dimension is outSize
*/
XTensor GLU::Make(XTensor& input)
{
XTensor t1;
XTensor t2;
TensorList input_list;
/* split the input into two vectors with the dim hSize */
Split(input, input_list, -1, 2);
/* t1 = W1 * x + b1 */
t1 = MulAndShift(input_list.GetItem(0), w1, b1);
/* t2 = W2 * x + b2 */
t2 = MulAndShift(input_list.GetItem(1), w2, b2);
return t1 * Sigmoid(t2);
}
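/* Gating sketch (illustrative): the input x = [x1; x2] is split into two
   halves of dimension hSize; the first half is transformed linearly while
   sigmoid(W2 * x2 + b2), which lies in (0, 1), acts as a soft gate that
   scales each output channel element-wise. */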
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __GLU_H__
#define __GLU_H__
#include "LayerNorm.h"
using namespace nts;
namespace nmt
{
/* a gated linear unit (GLU): y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2) */
class GLU
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
public:
/* constructor */
GLU();
/* de-constructor */
~GLU();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include "Embedding.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace nmt
{
/* constructor */
LayerHistory::LayerHistory()
{
d = -1;
/* start at 0 so that layerNorms[count - 2] is valid from the second Add() on */
count = 0;
weight = NULL;
layerNorms = NULL;
}
/* de-constructor */
LayerHistory::~LayerHistory()
{
history.Clear();
delete[] layerNorms;
}
/*
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
nlayer = config.nEncLayer;
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
layerNorms[i].InitModel(config);
}
}
/*
the Add operation
>> tensor - the previous layer output. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
*/
void LayerHistory::Add(XTensor& tensor)
{
/* the embedding is not normed */
count += 1;
if (history.Size() == 0) {
history.Add(&tensor);
return;
}
XTensor ln = layerNorms[count - 2].Make(tensor);
history.Add(&ln);
}
/*
generate the weighted sum of all previous layer outputs in the history as the input of the next layer
*/
XTensor LayerHistory::Pop()
{
/* the number of layer output in the history */
size_t size = history.Size();
TensorList historyList;
for (size_t i = 0; i < size; i++)
historyList.Add(history[i]);
/* stack the tensors along the first dimension */
XTensor stackTensor = Stack(historyList, 0);
XTensor interWeight;
InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
XTensor layerWeight;
InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);
_SelectRange(&weight, &interWeight, 0, size - 1, size);
interWeight.Reshape(interWeight.unitNum);
_SelectRange(&interWeight, &layerWeight, 0, 0, size);
MultiplyDimMe(stackTensor, layerWeight, 0);
XTensor result;
ReduceSum(stackTensor, result, 0);
return result;
}
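/* Usage sketch (illustrative; `config` and the per-layer tensors are assumed):

   LayerHistory history;
   history.InitModel(config);
   history.ClearHistory();
   history.Add(embedding);          // layer 0: the (un-normed) embedding
   XTensor input1 = history.Pop();  // weighted sum used as layer-1 input
   history.Add(layer1Output);

   This realizes the dense connections of dlcl: each layer reads a learned
   weighted combination of all preceding layer outputs. */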
void LayerHistory::ClearHistory()
{
history.Clear();
/* reset the layer counter so that Add() starts from the embedding again */
count = 0;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/*
layer history for densely connected layers (dlcl):
the input of layer i is a learned weighted sum of the
(normalized) outputs of all layers 0 ... i-1
*/
class LayerHistory
{
public:
/* device id */
int devID;
/* the triangle weight matrix for dlcl */
XTensor weight;
/* hidden size */
int d;
/* layer number */
int nlayer;
/* current layer number */
int count;
/* a history to store the outputs of intermediate layers */
TensorList history;
/* layer normalization for each intermediate layer */
LN* layerNorms;
public:
/* constructor */
LayerHistory();
/* de-constructor */
~LayerHistory();
/* initialize the model */
void InitModel(Config& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
/* compute the layer input for the current layer, the weight sum of all previous layer output after normed in the history */
XTensor Pop();
/* clear the history */
void ClearHistory();
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Embedding.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
LN::LN()
{
devID = -1;
d = 0;
}
/* de-constructor */
LN::~LN()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void LN::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
/* initialize the scale to 1 and the bias to 0 */
w.SetDataFixed(1);
b.SetZeroAll();
}
/*
make the network
>> input - the input tensor
>> return - layer normalization output
*/
XTensor LN::Make(XTensor& input)
{
XTensor& x = input;
XTensor xn;
XTensor mean;
XTensor variance;
XTensor standard;
XTensor meanFilled;
XTensor standardFilled;
TENSOR_DATA_TYPE dataType = input.dataType;
if (dataType == X_FLOAT16) {
/* reduce functions can only run with FP32 */
x = ConvertDataType(input, X_FLOAT);
}
/* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1);
/* \sigma^2 = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
/* unsqueeze mean and standard deviation to fit them into
the same shape of x */
meanFilled = Unsqueeze(mean, x.order - 1, x.GetDim(-1));
standardFilled = Unsqueeze(standard, x.order - 1, x.GetDim(-1));
/* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled;
if (dataType != mean.dataType) {
x = ConvertDataType(x, dataType);
xn = ConvertDataType(xn, dataType);
}
/* result = x' * w + b */
return xn * w + b;
}
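/* Worked example (illustrative): for a row x = [1, 2, 3], mean = 2 and,
   since ReduceVariance divides by m, variance = 2/3 and standard ~ 0.8165,
   so xn ~ [-1.22, 0, 1.22]; with w = 1 and b = 0 this is also the output. */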
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __LAYERNORMAL_H__
#define __LAYERNORMAL_H__
#include "../Utility.h"
#include "../../niutensor/network//XNet.h"
using namespace nts;
namespace nmt
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class LN
{
public:
/* device id */
int devID;
/* the transformation matrix w */
XTensor w;
/* the bias term b */
XTensor b;
/* dimension size of the model */
int d;
public:
/* constructor */
LN();
/* de-constructor */
~LN();
/* initialize the model */
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "NNUtil.h"
namespace nmt
{
/*
a wrapper for the gather function
>> src - the input tensor
>> index - the index tensor
<< res - the output tensor
*/
XTensor AutoGather(XTensor& src, XTensor& index)
{
if (src.order == 2)
return Gather(src, index);
else {
CheckNTErrors(src.order == 3, "the source must be 3d");
int order = src.order;
int dimSize[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < src.order; i++) {
dimSize[i] = src.dimSize[i];
}
src.Reshape(src.dimSize[0], src.dimSize[1] * src.dimSize[2]);
XTensor res = Gather(src, index);
src.Reshape(order, dimSize);
dimSize[0] = index.dimSize[0];
dimSize[1] = res.unitNum / (dimSize[0] * dimSize[2]);
res.Reshape(order, dimSize);
return res;
}
}
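/* Shape sketch (illustrative): for a 3-d source of shape (B, L, H) and a
   1-d index of length K, the source is first viewed as (B, L * H); Gather
   then selects K rows, and the result is reshaped back to (K, L, H), i.e.
   AutoGather gathers along the first dimension. */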
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __NNUTIL_H__
#define __NNUTIL_H__
#include "../../niutensor/tensor/XGlobal.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/* the gather function for tensor with any dimension */
XTensor AutoGather(XTensor& src, XTensor& index);
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "Output.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
namespace nmt
{
/* constructor */
Output::Output()
{
devID = -1;
vSize = -1;
hSize = -1;
}
/* de-constructor */
Output::~Output()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void Output::InitModel(Config& config)
{
devID = config.devID;
hSize = config.modelSize;
vSize = config.tgtVocabSize;
InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);
DTYPE v = 1.0F / (float)sqrt((float)hSize);
w.SetDataRandn(0, v);
}
/*
make the network (redefined output tensor)
>> input - input tensor
>> output - output tensor
>> isTraining - whether it is used for training
>> normalized - whether to normalize the output with log-softmax (for beam search)
*/
void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
XTensor& x = input;
output = MMul(x, X_NOTRANS, w, X_TRANS);
/* use softmax for training */
if (isTraining) {
output = Softmax(output, -1);
return;
}
/* normalize the output for beam search */
if (normalized) {
auto dataType = output.dataType;
if (dataType == X_FLOAT16)
output = ConvertDataType(output, X_FLOAT);
output = LogSoftmax(output, -1);
if (output.dataType != dataType)
output = ConvertDataType(output, dataType);
}
}
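/* Usage sketch (illustrative; `config` and `hidden` are assumed to exist):

   Output out;
   out.InitModel(config);
   XTensor prob;
   out.Make(hidden, prob, false, true);

   Since w is shaped (vSize, hSize), the projection uses a transposed
   multiplication (X_TRANS); with normalized = true the result holds
   log-probabilities suitable for beam search. */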
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __OUTPUT_H__
#define __OUTPUT_H__
#include "../Utility.h"
#include "../../niutensor/tensor/function/FHeader.h"
using namespace nts;
namespace nmt
{
/* output layer */
class Output
{
public:
/* device id */
int devID;
/* vocabulary size */
int vSize;
/* vector size of the linear transformation */
int hSize;
/* transformation matrix */
XTensor w;
public:
/* constructor */
Output();
/* de-constructor */
~Output();
/* initialize the model */
void InitModel(Config& config);
/* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the items in a range by key (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 4 bytes: source vocabulary size
next 4 bytes: target vocabulary size
next 8 bytes: number of sentence pairs
subsequent segments:
source sentence length (4 bytes)
target sentence length (4 bytes)
source tokens (4 bytes per token)
target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
buffer.Clear();
curIdx = 0;
int id = 0;
uint64_t sentNum = 0;
int srcVocabSize = 0;
int tgtVocabSize = 0;
fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
fread(&sentNum, sizeof(uint64_t), 1, fp);
CheckNTErrors(sentNum > 0, "Invalid sentence pairs number");
while (id < sentNum) {
int srcLen = 0;
int tgtLen = 0;
fread(&srcLen, sizeof(int), 1, fp);
fread(&tgtLen, sizeof(int), 1, fp);
CheckNTErrors(srcLen > 0, "Invalid source sentence length");
CheckNTErrors(tgtLen > 0, "Invalid target sentence length");
IntList srcSent;
IntList tgtSent;
srcSent.ReadFromFile(fp, srcLen);
tgtSent.ReadFromFile(fp, tgtLen);
TrainExample* example = new TrainExample;
example->id = id++;
example->key = id;
example->srcSent = srcSent;
example->tgtSent = tgtSent;
buffer.Add(example);
}
fclose(fp);
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the label of input
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - number of target tokens and sentences
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID)
{
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
/* dynamic batching for sentences, enabled when the dataset is used for training */
if (isTraining) {
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
realBatchSize++;
}
}
/* real batch size */
if ((buffer.Size() - curIdx) < realBatchSize) {
realBatchSize = buffer.Size() - curIdx;
}
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
int* batchEncValues = new int[realBatchSize * maxSrcLen];
float* paddingEncValues = new float[realBatchSize * maxSrcLen];
int* labelValues = new int[realBatchSize * maxTgtLen];
int* batchDecValues = new int[realBatchSize * maxTgtLen];
float* paddingDecValues = new float[realBatchSize * maxTgtLen];
for (int i = 0; i < realBatchSize * maxSrcLen; i++) {
batchEncValues[i] = PAD;
paddingEncValues[i] = 1;
}
for (int i = 0; i < realBatchSize * maxTgtLen; i++) {
batchDecValues[i] = PAD;
labelValues[i] = PAD;
paddingDecValues[i] = 1.0F;
}
size_t curSrc = 0;
size_t curTgt = 0;
/*
batchEnc: end with EOS (left padding)
batchDec: begin with SOS (right padding)
label: end with EOS (right padding)
*/
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
if (j > 0)
labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
}
labelValues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
paddingEncValues[curSrc++] = 0;
while (curTgt < maxTgtLen * (i + 1))
paddingDecValues[curTgt++] = 0;
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
curIdx += realBatchSize;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
batchDec->SetData(batchDecValues, batchDec->unitNum);
paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
label->SetData(labelValues, label->unitNum);
delete[] batchEncValues;
delete[] paddingEncValues;
delete[] batchDecValues;
delete[] paddingDecValues;
delete[] labelValues;
info.Add(tgtTokenNum);
info.Add(realBatchSize);
return info;
}
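/* Layout sketch (illustrative): assuming a stored target sentence
   [SOS, a, b, EOS] and maxTgtLen = 6, the loop above produces

   batchDec: SOS a b   EOS PAD PAD
   label:    a   b EOS EOS PAD PAD

   i.e. the label is the decoder input shifted left by one position, and
   paddingDec is 1 over real tokens and 0 over the padded tail. */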
/*
initialize the dataset for training
>> dataFile - path of the data file
>> bucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
fp = fopen(dataFile, "rb");
CheckNTErrors(fp, "can not open the training file");
curIdx = 0;
bucketSize = myBucketSize;
isTraining = training;
LoadDataToBuffer();
SortByLength();
if (isTraining)
BuildBucket();
}
/* check if the buffer is empty */
bool TrainDataSet::IsEmpty() {
if (curIdx < buffer.Size())
return false;
return true;
}
/* reset the buffer */
void TrainDataSet::ClearBuf()
{
curIdx = 0;
/* make different batches in different epochs */
SortByLength();
if (isTraining)
BuildBucket();
}
/* group data into buckets with similar length */
void TrainDataSet::BuildBucket()
{
size_t idx = 0;
/* build and shuffle buckets */
while (idx < buffer.Size()) {
/* sentence number in a bucket */
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[curIdx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
/* make sure the number is valid */
if ((buffer.Size() - idx) < sentNum) {
sentNum = buffer.Size() - idx;
}
int randomKey = rand();
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
}
idx += sentNum;
}
SortBucket();
/* sort items in a bucket */
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
idx += sentNum;
}
}
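/* Worked example (illustrative): with bucketSize = 8 and sorted source
   lengths [5, 5, 4, 3, 2], bucketing stops once
   sentNum * maxSrcLen >= bucketSize, giving buckets {5, 5}, {4, 3} and {2};
   each bucket then receives one random key so that whole buckets are
   shuffled between epochs while their contents stay length-sorted. */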
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
/* release the buffer */
for (int i = 0; i < buffer.Size(); i++)
delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../niutensor/tensor/XList.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a class of sentence pairs for training */
struct TrainExample {
/* id of the sentence pair */
int id;
/* source language sentence (tokenized) */
IntList srcSent;
/* target language sentence (tokenized) */
IntList tgtSent;
/* the key used to shuffle items in a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
public:
/* the data buffer */
TrainBufferType buffer;
/* a list of empty line number */
IntList emptyLines;
/* the pointer to file stream */
FILE* fp;
/* current index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences */
size_t bucketSize;
/* indicates whether it is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort the output by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is empty */
bool IsEmpty();
/* reset the buffer */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#include "Trainer.h"
#include "../Utility.h"
#include "../../niutensor/network/XNoder.h"
#include "../../niutensor/tensor/XUtility.h"
#include "../../niutensor/tensor/core/CHeader.h"
#include "../../niutensor/tensor/loss/LHeader.h"
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
#include "../../niutensor/tensor/XMem.h"
namespace nmt
{
/* constructor */
Trainer::Trainer()
{
cfg = NULL;
}
/* de-constructor */
Trainer::~Trainer()
{
for (int i = 0; i < moments.count; i++) {
XTensor* m = (XTensor*)moments.Get(i);
delete m;
}
for (int i = 0; i < moments2nd.count; i++) {
XTensor* m = (XTensor*)moments2nd.Get(i);
delete m;
}
}
/*
initialization
>> config - configurations of the training process
*/
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
wBatchSize = config.wBatchSize;
bucketSize = config.bucketSize;
nepoch = config.nepoch;
nstep = config.nstep;
maxCheckpoint = config.maxCheckpoint;
d = config.modelSize;
nwarmup = config.nwarmup;
vSize = config.srcVocabSize;
vSizeTgt = config.tgtVocabSize;
useAdam = config.useAdam;
adamBeta1 = config.adamBeta1;
adamBeta2 = config.adamBeta2;
adamDelta = config.adamDelta;
isShuffled = config.isShuffled;
labelSmoothingP = config.labelSmoothingP;
nStepCheckpoint = config.nStepCheckpoint;
useEpochCheckpoint = config.useEpochCheckpoint;
updateStep = config.updateStep;
isLenSorted = config.isLenSorted;
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/*
train the model
>> fn - training data file
>> validFN - validation data file
>> modelFN - where we keep the model
>> model - model to train
*/
void Trainer::Train(const char* fn, const char* validFN,
const char* modelFN, Model* model)
{
/* disable cache during training */
for (int i = 0; i < model->decoder->nlayer; i++) {
model->decoder->selfAttCache[i].enable = false;
model->decoder->enDeAttCache[i].enable = false;
}
int step = 0;
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int batchCountTotal = 0;
bool isEnd = false;
float loss = 0;
float lr = 0;
int nStepCheck = 0;
int nCheckpoint = 0;
int nSkipped = 0;
int gradStep = 0;
int validStep = 0;
int epoch = 0;
char* trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn);
#ifndef WIN32
if (isShuffled)
sprintf(trainFN, "%s.random", fn);
#endif
int devID = model->devID;
PrepareModel(model);
double startT = GetClockSec();
batchLoader.Init(fn, bucketSize, true);
for (epoch = 1; epoch <= nepoch; epoch++) {
wordCount = 0;
loss = 0;
/* reset the batch loader */
batchLoader.ClearBuf();
while (!batchLoader.IsEmpty())
{
XNet net;
net.Clear();
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* labels */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
XTensor output;
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true);
else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true);
else {
ShowNTErrors("Illegal model type!");
}
/* get loss and probabilities */
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
if (doUpdate) {
/* back-propagation */
net.Backward(lossTensor);
gradStep += 1;
loss += lossBatch;
wordCount += wc;
wordCountTotal += wc;
batchCountTotal += ws;
/* update the parameters */
if (gradStep == updateStep) {
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (step < nwarmup)
lr = warmupInitLR + step * lrStep;
else
lr = decayFactor * pow((float)step, -0.5F);
/* model update */
Update(model, lr);
gradStep = 0;
validStep++;
}
}
else
nSkipped++;
if (++step >= nstep) {
isEnd = true;
break;
}
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
LOG("elapsed=%.1fs, step=%d, epoch=%d, "
"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
elapsed, step, epoch, wordCountTotal, batchCountTotal,
loss / wordCount / log(2.0), exp(loss / wordCount), lr);
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
}
if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0;
nCheckpoint++;
}
}
if (isEnd)
break;
if (useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
}
double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch);
LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
LOG("training finished (took %.1fs, step=%d, "
"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
LOG("saving the final model");
model->Dump(modelFN);
delete[] trainFN;
}
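/* Schedule sketch (illustrative values): with lrate = 7e-4 and nwarmup = 4000,
   the learning rate rises linearly from ~1e-7 to 7e-4 during the first 4000
   steps, then decays as lrate * sqrt(nwarmup / step); at step 16000 this
   gives 7e-4 * sqrt(4000 / 16000) = 3.5e-4. */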
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void Trainer::Validate(const char* fn, const char* ofn, Model* model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int sentCount = 0;
float loss = 0;
/* data files */
batchLoader.Init(fn, 0, false);
double startT = GetClockSec();
while (!batchLoader.IsEmpty())
{
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* output probabilities */
XTensor output;
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false);
else {
ShowNTErrors("Illegal model type!");
}
int bSize = output.GetDim(0);
int length = output.GetDim(1);
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
loss += lossBatch;
wordCount += wc;
sentCount += bSize;
}
double elapsed = GetClockSec() - startT;
LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
}
/*
make a checkpoint
>> model - the model
>> validFN - validation data file
>> modelFN - model data file
>> label - label of the model
>> id - id of the checkpoint
*/
void Trainer::MakeCheckpoint(Model* model, const char* validFN,
const char* modelFN, const char* label, int id)
{
LOG("make a checkpoint");
char* fn = new char[MAX_LINE_LENGTH];
Trainer validator;
validator.Init(*cfg);
/* rotate the checkpoint id so that only the last maxCheckpoint models are kept */
id = validator.maxCheckpoint - (maxCheckpoint--);
if (maxCheckpoint == 0)
maxCheckpoint = validator.maxCheckpoint;
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
model->Dump(fn);
delete[] fn;
char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if (validFN != NULL) {
validator.Validate(validFN, fn2, model);
}
delete[] fn2;
}
/*
update the model by delta rule
\theta_{new} = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
>> model - the model
>> lr - learning rate
*/
void Trainer::Update(Model* model, const float lr)
{
TensorList ws;
model->GetParams(ws);
for (int i = 0; i < ws.Size(); i++) {
XTensor* para = ws[i];
XTensor* paraGrad = para->grad;
if (paraGrad == NULL)
continue;
CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if (useAdam) {
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
/* m = beta_1 * m + (1-beta_1) * grad */
XTensor* m = (XTensor*)moments.Get(i);
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, paraGrad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad*/
XTensor* v = (XTensor*)moments2nd.Get(i);
_Multiply(paraGrad, paraGrad, v, adamBeta2 / (1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
/* the delta rule */
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
}
else {
/* the delta rule */
_Sum(para, paraGrad, para, -lr);
}
/* clear gradient */
paraGrad->SetZeroAll();
}
}
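/* Update sketch: the Adam branch above implements

   m  = beta1 * m + (1 - beta1) * g
   v  = beta2 * v + (1 - beta2) * g^2
   e  = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
   w -= e * m / (sqrt(v) + delta')

   where delta' = adamDelta * sqrt(1 - beta2^t); the bias-correction terms
   beta1^t and beta2^t are tracked in adamBeta1T and adamBeta2T. */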
/*
prepare model for training
>> model - the model for training
*/
void Trainer::PrepareModel(Model* model)
{
moments.Clear();
moments2nd.Clear();
TensorList ws;
model->GetParams(ws);
for (int i = 0; i < ws.Size(); i++) {
XTensor* para = ws[i];
XNoder::MakeGrad(para);
if (useAdam) {
XTensor* m = new XTensor(para);
XTensor* m2 = new XTensor(para);
m->SetZeroAll();
m2->SetZeroAll();
moments.Add(m);
moments2nd.Add(m2);
}
}
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __TRAINER_H__
#define __TRAINER_H__
#include "../Model.h"
#include "TrainDataSet.h"
using namespace nts;
namespace nmt
{
/* trainer of the model */
class Trainer
{
public:
/* configurations */
Config* cfg;
/* dimension size of each inner layer */
int d;
/* step number of warm-up for training */
int nwarmup;
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
/* word batch size */
int wBatchSize;
/* size of bucket for grouping data by length */
int bucketSize;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use adam */
bool useAdam;
/* hyper-parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moment of the parameter matrices */
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
TensorList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* used for loading batches */
TrainDataSet batchLoader;
public:
/* constructor */
Trainer();
/* de-constructor */
~Trainer();
/* initialize the trainer */
void Init(Config& config);
/* train the model */
void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
/* test the model */
void Validate(const char* fn, const char* ofn, Model* model);
/* make a checkpoint */
void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */
void Update(Model* model, const float lr);
/* prepare model for training */
void PrepareModel(Model* model);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "DataSet.h"
#include "../Utility.h"
using namespace nmt;
namespace nts {
/* sort the input by length (in descending order) */
void DataSet::SortInput() {
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
[](Example* a, Example* b) {
return a->values.count > b->values.count;
});
}
/* sort the output by id (in ascending order) */
void DataSet::SortOutput() {
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
[](Result* a, Result* b) {
return a->id < b->id;
});
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
inputBuffer.Clear();
bufferUsed = 0;
int id = 0;
const string tokenDelimiter = " ";
while (getline(*fp, line)) {
IntList values;
/* load words and transform them to ids */
auto indices = SplitToPos(line, tokenDelimiter);
/* keep only the first MAX_WORD_NUM (120) words if the input is too long */
size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
for (size_t i = 0; i < maxLen; i++) {
auto offset = (i != (indices.Size() - 1)) ?
indices[i + 1] - indices[i] - tokenDelimiter.size()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(UNK);
else
values.Add(srcVocab.word2id.at(word));
}
/* make sure that the sequence ends with EOS */
if (values.Size() != 0 && values[-1] != EOS)
values.Add(EOS);
Example* example = new Example;
example->id = id;
example->values = values;
if (values.Size() != 0)
inputBuffer.Add(example);
else
emptyLines.Add(id);
id++;
}
fp->close();
SortInput();
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for translating)
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< indices of the sentences
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t minSentBatch, size_t batchSize, int devID)
{
size_t realBatchSize = minSentBatch;
/* get the maximum sentence length in a mini-batch */
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
/* dynamic batching is disabled here, so a batch
   always holds minSentBatch sentences */
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
realBatchSize = inputBuffer.Size() - bufferUsed;
}
CheckNTErrors(maxLen != 0, "invalid length");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
batchValues[i] = PAD;
paddingValues[i] = 1.0F;
}
size_t curSrc = 0;
/* right padding */
UInt64List infos;
size_t totalLength = 0;
for (int i = 0; i < realBatchSize; ++i) {
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
}
infos.Add(totalLength);
InitTensor2D(batchEnc, realBatchSize, maxLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return infos;
}
/*
initialize the dataset for translation
>> dataFile - path of the data file
>> srcVocabFN - path of the source vocab file
>> tgtVocabFN - path of the target vocab file
*/
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
fp = new ifstream(dataFile);
CheckNTErrors(fp->is_open(), "Can not open the test data");
bufferUsed = 0;
CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
CheckNTErrors(strcmp(tgtVocabFN, "") != 0, "missing target vocab file");
srcVocab.Load(srcVocabFN);
/* share source and target vocabs */
if (strcmp(srcVocabFN, tgtVocabFN) == 0) {
XPRINT(0, stderr, "[INFO] share source and target vocabs \n");
tgtVocab.CopyFrom(srcVocab);
}
else {
tgtVocab.Load(tgtVocabFN);
}
LoadDataToBuffer();
}
/* check if the buffer is empty */
bool DataSet::IsEmpty() {
if (bufferUsed < inputBuffer.Size())
return false;
return true;
}
/* dump the translation to a file */
void DataSet::DumpRes(const char* ofn)
{
ofstream ofile(ofn, ios::out);
for (int t = 0; t < outputBuffer.Size(); t++) {
auto res = outputBuffer[t];
for (int i = 0; i < res->res.Size(); i++) {
/* ids below 4 are reserved for special symbols (e.g., PAD/SOS/EOS/UNK),
   so stop at the first special token */
if (res->res[i] < 4)
break;
ofile << tgtVocab.id2word[res->res[i]] << " ";
}
ofile << "\n";
}
ofile.close();
}
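/* Usage sketch (illustrative; file paths and `devID` are assumptions):

   DataSet testSet;
   testSet.Init("test.txt", "vocab.src", "vocab.tgt");
   while (!testSet.IsEmpty()) {
       XTensor batchEnc, paddingEnc;
       UInt64List ids = testSet.LoadBatch(&batchEnc, &paddingEnc, 32, 0, devID);
       // ... translate the batch and append results to outputBuffer ...
   }
   testSet.SortOutput();
   testSet.DumpRes("output.txt");
*/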
/* de-constructor */
DataSet::~DataSet()
{
/* release the file */
delete fp;
/* release the input buffer */
for (int i = 0; i < inputBuffer.Size(); i++)
delete inputBuffer[i];
/* release the output buffer */
for (int i = 0; i < outputBuffer.Size(); i++)
delete outputBuffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "Vocab.h"
#include "../../niutensor/tensor/XList.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* the struct of tokenized input */
struct Example {
int id;
IntList values;
};
/* the struct of tokenized output */
struct Result {
int id;
IntList res;
};
/* A `DataSet` is associated with a file which contains variable length data.*/
struct DataSet {
public:
/* the data buffer */
InputBufferType inputBuffer;
/* a list of empty line number */
IntList emptyLines;
/* the result buffer */
OutputBufferType outputBuffer;
/* the pointer to file stream */
ifstream* fp;
/* size of used data in buffer */
size_t bufferUsed;
/* the source vocabulary */
Vocab srcVocab;
/* the target vocabulary */
Vocab tgtVocab;
public:
/* sort the input by length */
void SortInput();
/* reorder the output by ids */
void SortOutput();
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t sBatch, size_t wBatch, int devID);
/* initialization function */
void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
/* check if the buffer is empty */
bool IsEmpty();
/* dump the translations to a file */
void DumpRes(const char* ofn);
/* de-constructor */
~DataSet();
};
}
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "LengthPenalty.h"
using namespace nts;
namespace nmt
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence
*/
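/* a worked example (illustrative numbers, not from the code): with alpha = 0.6
and a sequence of length n = 10, lp = ((5 + 10) / 6)^0.6 = 2.5^0.6 ≈ 1.73,
so the log-probability of the path is divided by about 1.73 before ranking */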
float LengthPenalizer::GNMT(float length, float alpha)
{
float base;
float lp;
base = (length + 5.0F) / (1.0F + 5.0F);
lp = pow(base, alpha);
return lp;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __LENGTHPENALTY_H__
#define __LENGTHPENALTY_H__
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
using namespace nts;
namespace nmt
{
/* We penalize short sequences: a hypothesis score is a product of
probability-like terms, so shorter sequences tend to receive higher scores
and thus have more chances to beat others in search. */
class LengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static float GNMT(float length, float alpha);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <iostream>
#include "Predictor.h"
#include "../layer/NNUtil.h"
using namespace nts;
namespace nmt
{
/* constructor */
StateBundle::StateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
StateBundle::~StateBundle()
{
if (states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void StateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new State[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = _PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
Predictor::Predictor()
{
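/* 2 is the default start symbol, matching SOS in the vocabulary */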
startSymbol = 2;
}
/* de-constructor */
Predictor::~Predictor()
{
}
/*
create an initial state
>> model - the model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
int beamSize, StateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->dimSize[i];
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll();
state->nstep = 0.0F;
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void Predictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void Predictor::Read(Model* model, StateBundle* state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states
>> aliveState - indices of alive states, (B)
>> encoding - encoder output, (B, L, E)
>> inputEnc - input of the encoder, (B, L)
>> paddingEnc - padding of the encoder, (B, L)
>> batchSize - the batch size of the decoder input (batch * beam; some states may have been pruned)
>> isStart - whether it is the start state or not
>> reorderState - the new order of states
>> needReorder - whether we need to reorder the states
>> nstep - current time step of the target sequence
*/
void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
{
int dims[MAX_TENSOR_DIM_NUM];
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
InitTensor2D(&first, batchSize, 1, X_INT, inputEnc.devID);
first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s, inputEnc.devID);
}
/* keep alive states for the decoder */
if (aliveState.dimSize[0] < batchSize) {
/* alive inputs */
inputDec = AutoGather(inputDec, aliveState);
/* alive cache */
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].KeepAlive(aliveState);
m->decoder->enDeAttCache[i].KeepAlive(aliveState);
}
}
if (needReorder) {
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].Reorder(reorderState);
m->decoder->enDeAttCache[i].Reorder(reorderState);
}
}
/* prediction probabilities */
XTensor& output = next->prob;
XTensor decoding;
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.dimSize[i];
dims[inputDec.order - 1] = inputDec.dimSize[inputDec.order - 1];
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc.devID);
paddingDec.SetDataFixed(1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, encoding, NULL, &maskEncDec, nstep, false);
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
/* generate the output probabilities */
m->outputLayer->Make(decoding, output, false, true);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor Predictor::GeneratePaths(StateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
/*
get the predictions of the previous step
>> state - state bundle of the current step
>> devID - the device id for the predictions
*/
XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
IntList last;
for (int i = 0; i < state->stateNum; i++) {
State* cur = state->states + i;
last.Add(cur->prediction);
}
XTensor lastPred;
InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
lastPred.SetData(last.items, int(last.Size()));
return lastPred;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __PREDICTOR_H__
#define __PREDICTOR_H__
#include "../Model.h"
#include "LengthPenalty.h"
using namespace std;
namespace nmt
{
#define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class State
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state */
State* last;
};
/* a bundle of states */
class StateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
float nstep;
/* list of states */
State* states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
StateBundle();
/* de-constructor */
~StateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class Predictor
{
private:
/* pointer to the transformer model */
Model* m;
/* current state */
StateBundle* s;
/* start symbol */
int startSymbol;
/* end symbol */
int endSymbol;
public:
/* constructor */
Predictor();
/* de-constructor */
~Predictor();
/* create an initial state */
void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(Model* model, StateBundle* state);
/* predict the next state */
void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(StateBundle* state, int devID);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Search.h"
#include "../Utility.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* constructor */
BeamSearch::BeamSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
BeamSearch::~BeamSearch()
{
if (fullHypos != NULL)
delete[] fullHypos;
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the search
>> config - the configuration
*/
void BeamSearch::Init(Config& config)
{
beamSize = config.beamSize;
batchSize = config.sBatchSize;
alpha = config.lenAlpha;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void BeamSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
needReorder = false;
/* prepare for the heap of hypotheses */
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
/* prepare for the indices of alive states */
aliveStatePids.Clear();
aliveSentList.Clear();
for (int i = 0; i < batchSize; i++) {
aliveStatePids.Add(i);
aliveSentList.Add(i);
}
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score)
{
Predictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input.unitNum / input.dimSize[input.order - 1], beamSize);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(input, &maskEnc, false);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(input, input.order - 1, beamSize);
paddingBeam = Unsqueeze(padding, padding.order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
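/* e.g., an encoder output of shape (B, L, E) is tiled to (B, beam, L, E)
and then merged into (B * beam, L, E), so that each hypothesis in the
beam works on its own copy of the encoding */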
/* max output-length = scalar * source-length */
int lengthLimit = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
StateBundle* states = new StateBundle[lengthLimit + 1];
StateBundle* first = states;
StateBundle* cur = NULL;
StateBundle* next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
XTensor aliveState;
InitTensor1D(&aliveState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(aliveState, 0);
XTensor reorderState;
InitTensor1D(&reorderState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(reorderState, 0);
/* generate the sequence from left to right */
for (int l = 0; l < lengthLimit; l++) {
if (beamSize > 1) {
inputBeam = AutoGather(inputBeam, reorderState);
paddingBeam = AutoGather(paddingBeam, reorderState);
encodingBeam = AutoGather(encodingBeam, reorderState);
}
cur = states + l;
next = states + l + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, aliveState, encodingBeam, inputBeam,
paddingBeam, batchSize * beamSize, l == 0, reorderState, needReorder, l);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(cur, next);
/* expand the search graph */
Expand(cur, next, reorderState);
/* push complete hypotheses into the heap */
Collect(next);
/* stop searching when all hypotheses are completed */
if (IsAllCompleted(next)) {
maxLength = l + 1;
break;
}
/* remove finished sentences */
//RemoveFinishedStates(next, encodingBeam, inputBeam, paddingBeam, aliveState);
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(next);
Dump(output, &score);
delete[] states;
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor mask;
int order = prob.order;
int outputSize = prob.dimSize[prob.order - 1];
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++)
dims[i] = prob.dimSize[i];
if (prob.dataType == X_FLOAT16)
prob = ConvertDataType(prob, X_FLOAT);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
SumDim(prob, probPathPrev, probPath, 0);
beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */
score = probPath / lp;
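/* with alpha > 0, lp grows with the hypothesis length, so dividing by it
keeps long hypotheses competitive against short ones */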
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
SumDim(score, firstMask, score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
SumDim(score, mask, score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
}
/*
generate tokens for the next state via beam pruning
>> prev - the last beam
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor indexCPU;
XTensor& score = beam->modelScore;
XTensor& index = beam->prediction;
XTensor& preID = beam->preID;
XTensor& probPath = beam->probPath;
XTensor& prob = beam->prob;
int order = score.order;
for (int i = 0; i < order; i++) {
dims[i] = score.dimSize[i];
dimsBeam[i] = score.dimSize[i];
dimsTopK[i] = score.dimSize[i];
}
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.dimSize[score.order - 1];
int stride = score.dimSize[score.order - 1];
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType, score.devID);
InitTensor(&index, order, dimsTopK, X_INT, score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
InitTensor(&indexCPU, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
prob.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypotheses. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypotheses in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it with vocab-size and computing the remainder. */
ModMe(index, sizeVocab);
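/* a small worked example (illustrative numbers): with beamSize = 2 and a
vocabulary of 8 words, a flat top-k index of 13 decodes to previous state
13 / 8 = 1 and word id 13 % 8 = 5 */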
/* we keep the top-k scores */
score = CopyValues(scoreTopK);
for (int i = 0; i < indexCPU.unitNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
indexCPU.SetInt(i * stride + indexCPU.GetInt(i + j), i + j);
}
}
/* sequence probability of top-k candidates */
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.dimSize[i];
dimsTopK[i] = scoreTopK.dimSize[i];
}
order = probPath.order;
prob.Reshape(prob.unitNum, 1);
probPath.Reshape(probPath.unitNum, 1);
indexCPU.Reshape(indexCPU.dimSize[0], indexCPU.dimSize[indexCPU.order - 1]);
indexCPU.SetDevice(prob.devID);
prob = Gather(prob, indexCPU);
probPath = Gather(probPath, indexCPU);
prob.Reshape(order, dimsTopK);
probPath.Reshape(order, dimsTopK);
}
/*
expand the search graph
>> prev - the last beam
>> beam - the beam that keeps a number of states
>> reorderState - the new order of states
*/
void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
State* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor reorderStateCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorOnCPU(&reorderStateCPU, &reorderState);
/* we copy the data to the CPU because frequent access to the GPU is slow
and we can speed up the process by doing the job on the CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
State& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
reorderStateCPU.SetInt(i + offset, i + j);
if (offset != j)
needReorder = true;
State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/*if(aliveStatePids.size() < batchSize)
state.pid = aliveStatePids[i/beamSize];*/
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
CopyValues(reorderStateCPU, reorderState);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Collect(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void BeamSearch::FillHeap(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) {
State& state = states[i * beamSize + j];
/* if the heap is empty, push the incomplete hypothesis directly */
if (fullHypos[state.pid].Count() == 0) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
else {
/* otherwise only keep it if it beats the current best;
calling Top() is safe here since the heap is non-empty */
auto node = fullHypos[state.pid].Top();
float score = node.value;
if (score < state.modelScore)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
>> score - score of the sequences
*/
void BeamSearch::Dump(IntList* output, XTensor* score)
{
int dims[3] = { batchSize, 1, maxLength };
InitTensor(score, 2, dims, X_FLOAT);
score->SetZeroAll();
/* heap for an input sentence in the batch */
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float>& heap = fullHypos[h];
int c = heap.Count();
float bestScore = -1e9F;
State* state = NULL;
for (int i = 0; i < c; i++) {
auto node = heap.Pop();
State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) {
state = s;
bestScore = node.value;
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
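/* positions after the end of the sequence are filled with
the end symbol (id 2, i.e., EOS) */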
if (isCompleted) {
output[h].Add(2);
}
else {
output[h].Add(state->prediction);
}
state = state->last;
}
output[h].Reverse();
score->Set2D(bestScore, h, 0);
}
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool BeamSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
endSymbols = NULL;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols, so allocate a fresh array and copy them */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool BeamSearch::IsAllCompleted(StateBundle* beam)
{
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
State& state = states[i];
if (!state.isCompleted)
return false;
}
return true;
}
/*
update the beam by removing finished hypotheses
>> beam - the beam that keeps the searching states
>> aliveEncoding - new input embeddings for the encoder, (B, L, E)
>> aliveInput - new input tokens of the encoder, (B, L)
>> alivePadding - new paddings for the inputs, (B, L)
<< aliveState - the indices of alive states
*/
void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState)
{
State* states = beam->states;
/* get the indices of uncompleted sentences and states */
aliveSentList.Clear();
IntList aliveStateList;
int count = 0;
/* the number of completed sentences */
for (int i = 0; i < beam->stateNum; i += beamSize) {
int endState = 0;
for (int j = 0; j < beamSize; j++) {
if (states[i + j].isEnd) {
endState++;
}
}
bool isSentCompleted = (endState == beamSize);
int sent = i / beamSize;
if (!isSentCompleted) {
aliveSentList.Add(sent);
for (int j = 0; j < beamSize; j++) {
aliveStateList.Add(i + j);
}
}
else {
aliveStatePids.Remove(sent - count);
count++;
}
}
InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent;
InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState);
alivePadding = AutoGather(alivePadding, aliveState);
aliveEncoding = AutoGather(aliveEncoding, aliveState);
beam->prob = AutoGather(beam->prob, aliveSent);
beam->endMark = AutoGather(beam->endMark, aliveSent);
beam->probPath = AutoGather(beam->probPath, aliveSent);
beam->modelScore = AutoGather(beam->modelScore, aliveSent);
beam->prediction = AutoGather(beam->prediction, aliveSent);
}
}
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
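/* e.g., for beamSize = 3 the mask over a flattened beam looks like
[0, -1e9, -1e9, 0, -1e9, -1e9, ...]: at the first step all hypotheses in a
beam are identical, so only the first one is allowed to survive pruning */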
XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.dimSize[i];
InitTensor(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9, i);
}
mask.SetDevice(prob.devID);
return mask;
}
/* constructor */
GreedySearch::GreedySearch()
{
maxLength = 0;
batchSize = 0;
endSymbolNum = 0;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
GreedySearch::~GreedySearch()
{
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the search
>> config - the configuration
*/
void GreedySearch::Init(Config& config)
{
batchSize = config.wBatchSize;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
*/
void GreedySearch::Prepare(int myBatchSize)
{
batchSize = myBatchSize;
}
/* check if the token is an end symbol */
bool GreedySearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/* set end symbols for search */
void GreedySearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
endSymbols = NULL;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols, so allocate a fresh array and copy them */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output)
{
XTensor maskEnc;
XTensor encoding;
/* dynamic batch size */
Prepare(input.unitNum / input.dimSize[input.order - 1]);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->encoder->Make(input, &maskEnc, false);
/* max output-length = scalar * source-length */
maxLength = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
/* the first token */
XTensor inputDec;
InitTensor2D(&inputDec, batchSize, 1, X_INT, input.devID);
inputDec.SetDataFixed(startSymbol);
/* initialize the finished flags */
int* finishedFlags = new int[batchSize];
for (int i = 0; i < batchSize; i++)
finishedFlags[i] = 0;
/* generate the sequence from left to right */
int l = 0;
for (; l < maxLength; l++) {
XTensor prob;
XTensor maskDec;
XTensor maskEncDec;
XTensor paddingDec;
XTensor decoding;
XTensor indexCPU;
XTensor bestScore;
InitTensor(&paddingDec, inputDec.order, inputDec.dimSize, X_INT, padding.devID);
paddingDec.SetDataFixed(1);
/* decoder mask */
model->MakeMTMaskDec(padding, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = model->decoder->Make(inputDec, encoding, NULL, &maskEncDec, l, false);
/* generate the output probabilities */
model->outputLayer->Make(decoding, prob, false, false);
/* get the most promising prediction */
prob.Reshape(prob.dimSize[0], prob.dimSize[prob.order - 1]);
InitTensor2D(&bestScore, prob.dimSize[0], 1, prob.dataType, prob.devID);
TopK(prob, bestScore, inputDec, -1, 1);
/* save the prediction */
InitTensorOnCPU(&indexCPU, &inputDec);
CopyValues(inputDec, indexCPU);
for (int i = 0; i < batchSize; i++) {
output[i].Add(indexCPU.GetInt(i));
if (IsEnd(indexCPU.GetInt(i)))
finishedFlags[i] = 1;
}
int finished = 0;
for (int i = 0; i < batchSize; i++)
finished += finishedFlags[i];
if (finished == batchSize)
break;
}
delete[] finishedFlags;
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __SEARCH_H__
#define __SEARCH_H__
#include "../Model.h"
#include "Predictor.h"
using namespace std;
namespace nmt
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class BeamSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
/* pids for alive states */
IntList aliveStatePids;
/* alive sentences */
IntList aliveSentList;
/* whether we need to reorder the states */
bool needReorder;
public:
/* constructor */
BeamSearch();
/* de-constructor */
~BeamSearch();
/* initialize the search */
void Init(Config& config);
/* search for the most promising states */
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */
void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */
void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */
void Collect(StateBundle* beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(StateBundle* beam);
/* save the output sequences and score */
void Dump(IntList* output, XTensor* score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */
void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(StateBundle* beam);
};
class GreedySearch
{
private:
/* predictor */
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
/* batch size */
int batchSize;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
public:
/* constructor */
GreedySearch();
/* de-constructor */
~GreedySearch();
/* initialize the search */
void Init(Config& config);
/* search for the most promising states */
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */
void Prepare(int myBatchSize);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
};
}
#endif
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "Search.h"
#include "Translator.h"
#include "../Utility.h"
#include "../../niutensor/tensor/XTensor.h"
#include "../../niutensor/tensor/XUtility.h"
#include "../../niutensor/tensor/core/CHeader.h"
using namespace nts;
namespace nmt
{
/* constructor */
Translator::Translator()
{
}
/* de-constructor */
Translator::~Translator()
{
if (beamSize > 1)
delete (BeamSearch*)seacher;
else
delete (GreedySearch*)seacher;
}
/* initialize the translator */
void Translator::Init(Config& config)
{
beamSize = config.beamSize;
vSize = config.srcVocabSize;
vSizeTgt = config.tgtVocabSize;
sentBatch = config.sBatchSize;
wordBatch = config.wBatchSize;
if (beamSize > 1) {
LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config);
}
else if (beamSize == 1) {
LOG("translating with greedy search");
seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config);
}
else {
CheckNTErrors(false, "Invalid beam size\n");
}
}
/*
translate an input file
>> ifn - input data file
>> sfn - source vocab file
>> tfn - target vocab file
>> ofn - output data file
>> model - pretrained model
*/
void Translator::Translate(const char* ifn, const char* sfn,
const char* tfn, const char* ofn, Model* model)
{
int wc = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn);
LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
int count = 0;
double batchStart = GetClockSec();
while (!batchLoader.IsEmpty())
{
count++;
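/* reset the decoder's self-attention and encoder-decoder
attention caches before translating a new batch */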
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
auto indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc,
sentBatch, wordBatch, devID);
IntList* output = new IntList[indices.Size() - 1];
/* greedy search */
if (beamSize == 1) {
((GreedySearch*)seacher)->Search(model, batchEnc, paddingEnc, output);
}
/* beam search */
else {
XTensor score;
((BeamSearch*)seacher)->Search(model, batchEnc, paddingEnc, output, score);
}
for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result;
res->id = int(indices[i]);
res->res = output[i];
batchLoader.outputBuffer.Add(res);
}
delete[] output;
wc += int(indices[-1]);
wordCountTotal += int(indices[-1]);
sentCount += int(indices.Size() - 1);
batchCount += 1;
if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec();
LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
wc = 0;
}
}
/* append empty lines to the result */
for (int i = 0; i < batchLoader.emptyLines.Size(); i++) {
Result* emptyRes = new Result;
emptyRes->id = batchLoader.emptyLines[i];
batchLoader.outputBuffer.Add(emptyRes);
}
double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void Translator::Dump(FILE* file, XTensor* output)
{
if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1];
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
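/* stop at an invalid id, PAD (1), or SOS/EOS (2) */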
if (w < 0 || w == 1 || w == 2)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
else
{
fprintf(file, "\n");
}
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TESTER_H__
#define __TESTER_H__
#include "Search.h"
#include "DataSet.h"
namespace nmt
{
/* This class translates test sentences with a trained model. */
class Translator
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* batch size for words */
int wordBatch;
/* beam size */
int beamSize;
/* for batching */
DataSet batchLoader;
/* decoder for inference */
void* seacher;
public:
/* constructor */
Translator();
/* de-constructor */
~Translator();
/* initialize the translator */
void Init(Config& config);
/* translate an input file */
void Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, Model* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#include <fstream>
#include "Vocab.h"
#include "../Utility.h"
namespace nts {
/* load a vocabulary from a file */
void Vocab::Load(const string& src)
{
string vsz, sid;
ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
}
f.close();
}
/* save a vocabulary to a file */
void Vocab::Save(const string& src)
{
ofstream f(src, ios::out);
/* the first line: size of the vocab and the start id */
f << vocabSize << "\t" << startID << "\n";
/* other lines: words and indices */
for (const auto& p : word2id)
f << p.first << "\t" << p.second << "\n";
f.close();
}
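/* an example of the resulting file (illustrative entries, assuming startID = 4):
8	4
hello	4
world	5
the first line holds the vocab size and the start id; each following
line holds a word and its id */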
/*
copy data from another vocabulary
>> v - the target vocabulary
*/
void Vocab::CopyFrom(const Vocab& v)
{
for (const auto& w2i : v.word2id)
word2id.insert(w2i);
for (const auto& i2w : v.id2word)
id2word.insert(i2w);
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <cstdio>
#include <unordered_map>
using namespace std;
namespace nts {
/* user-defined symbols */
#define PAD 1
#define SOS 2
#define EOS 2
#define UNK 3
/* the vocabulary class */
struct Vocab
{
/* the start id for words */
int startID;
/* size of the vocabulary */
int vocabSize;
/* a dict that maps words to ids */
unordered_map<string, int> word2id;
/* a dict that maps ids to words */
unordered_map<int, string> id2word;
/* load a vocabulary from a file */
void Load(const string& src);
/* save a vocabulary to a file */
void Save(const string& src);
/* copy data from another vocab */
void CopyFrom(const Vocab& v);
};
}
#endif
\ No newline at end of file
'''
Ensemble multiple models by checkpoint averaging.
Usage: python3 Ensemble.py -input <model_files> -output <ensembled_model>
Help: python3 Ensemble.py -h
'''
import argparse
import numpy as np
from glob import glob
from struct import pack
from struct import unpack
parser = argparse.ArgumentParser(
description='A model ensemble tool for NiuTrans.NMT')
parser.add_argument('-input', help='Model file pattern, e.g., \'model.bin.*\'',
type=str, default='model.bin.*')
parser.add_argument('-output', help='The ensembled model',
type=str, default='model.ensemble')
args = parser.parse_args()
model_files = glob(args.input)
meta_infos = None
parameters = []
for file in model_files:
with open(file, "rb") as f:
meta_infos = f.read(12 * 4)
data = f.read()
values = unpack('f' * (len(data) // 4), data)
print("Loaded {} parameters from: {}".format(len(values), file))
parameters.append(np.array(values))
parameters = np.mean(np.array(parameters), axis=0)
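# e.g., averaging two checkpoints gives the elementwise mean (p1 + p2) / 2 for
# every parameter; the 12-int meta header is taken from the last file read above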
with open(args.output, "wb") as f:
f.write(meta_infos)
values = pack("f" * len(parameters), *parameters)
f.write(values)
print("Model ensemble finished")
'''
Convert the format of a model.
Usage: python3 FormatConverter.py -input <raw_model> -output <new_model> -format <fp16|fp32>
Help: python3 FormatConverter.py -h
'''
import argparse
import numpy as np
from glob import glob
from struct import pack
from struct import unpack
parser = argparse.ArgumentParser(
description='The format converter for NiuTrans.NMT')
parser.add_argument('-input', help='Path of the raw model file',
type=str, default='')
parser.add_argument('-output', help='Path of the new model file',
type=str, default='')
parser.add_argument('-format', help='Target storage format, FP16 (Default) or FP32', type=str, default='fp16')
args = parser.parse_args()
args.format = args.format.lower()
META_INFO_NUM = 12
meta_infos = None
parameters = None
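# PARAM_LEN is the byte width of each value in the *input* file:
# converting to fp32 implies an fp16 input (2 bytes per value), and vice versa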
if args.format == 'fp32':
PARAM_LEN = 2
elif args.format == 'fp16':
PARAM_LEN = 4
else:
raise NotImplementedError("Unsupported data type")
with open(args.input, "rb") as f:
meta_infos = f.read(META_INFO_NUM * 4)
data = f.read()
if args.format == 'fp32':
values = unpack('e' * (len(data) // PARAM_LEN), data)
elif args.format == 'fp16':
values = unpack('f' * (len(data) // PARAM_LEN), data)
print("Loaded {} parameters from: {}".format(len(values), args.input))
parameters = np.array(values)
with open(args.output, "wb") as f:
f.write(meta_infos)
if args.format == 'fp32':
values = pack("f" * len(parameters), *(parameters.astype(np.float32)))
elif args.format == 'fp16':
values = pack("e" * len(parameters), *(parameters.astype(np.float16)))
f.write(values)
\ No newline at end of file
'''
Convert a bpe vocabulary to a NiuTrans.NMT vocab
Usage: python3 GetVocab.py -raw <bpe_vocab> -new <niutrans_nmt_vocab>
'''
import sys
import argparse
parser = argparse.ArgumentParser(description='Convert a BPE vocabulary to a NiuTrans.NMT vocabulary')
parser.add_argument('-raw', help='Path of the BPE vocabulary', type=str, default='')
parser.add_argument('-new', help='Path of the NiuTrans.NMT vocabulary to be saved', type=str, default='')
args = parser.parse_args()
# User defined words
PAD=1
SOS=2
EOS=2
UNK=3
with open(args.raw, "r", encoding="utf8") as fi:
with open(args.new, "w", encoding="utf8") as fo:
all_lines = fi.readlines()
vocab_size = len(all_lines) + UNK + 1
# make sure the vocabulary size is divisible by 8
vocab_size += (8 - vocab_size % 8)
start_id = UNK + 1
# first line: vocab size, start id
fo.write("{} {}\n".format(vocab_size, start_id))
# other lines: word, id
for l in all_lines:
fo.write("{} {}\n".format(l.split()[0], start_id))
start_id += 1
\ No newline at end of file
'''
Convert a fairseq checkpoint to a NiuTrans.NMT model.
Usage: python3 ModelConverter.py -src <fairseq_models> -tgt <niutrans_nmt_model>
Help: python3 ModelConverter.py -h
Requirements: fairseq >= 0.6.2
'''
import torch
import argparse
import numpy as np
from glob import glob
from struct import pack
parser = argparse.ArgumentParser(
description='The model converter for NiuTrans.NMT')
parser.add_argument('-src', help='The pattern used to find fairseq checkpoints, e.g., \'checkpoint*\'',
type=str, default='checkpoint')
parser.add_argument('-tgt', help='The file name prefix for Niutrans.NMT models',
type=str, default='model')
parser.add_argument('-mode', help='Storage mode, FP32 (Default) or FP16', type=str, default='fp32')
args = parser.parse_args()
args.mode = args.mode.lower()
def get_model_parameters(m):
'''
get flattened transformer model parameters
'''
p = []
encoder_emb = None
decoder_emb = None
decoder_output_w = None
for k in m['model']:
if 'encoder.embed_tokens.weight' in k:
encoder_emb = m['model'][k]
elif 'decoder.embed_tokens.weight' in k:
decoder_emb = m['model'][k]
elif 'decoder.embed_out' in k:
decoder_output_w = m['model'][k]
elif m['model'][k].numel() != 1:
# ignore fairseq version descriptions
if 'weight' in k:
# weights for qkv
if 'in_proj' in k:
# split qkv weights to slices
dim = m['model'][k].shape[0] // 3
p.append((m['model'][k][:dim, :]).t())
p.append((m['model'][k][dim:dim*2, :]).t())
p.append((m['model'][k][dim*2:, :]).t())
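# e.g., a fused in_proj weight of shape (3 * 512, 512) yields three
# (512, 512) slices for q, k and v, each transposed before saving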
else:
if 'norm' in k:
p.append(m['model'][k])
else:
# transpose all other weights for matrix multiplication
p.append(m['model'][k].t())
else:
# bias
p.append(m['model'][k])
# encoder embedding weight
p.append(encoder_emb)
# decoder embedding weight
if decoder_emb is not None:
p.append(decoder_emb)
else:
print('Sharing all embeddings')
# decoder output weight
if decoder_output_w is not None:
p.append(decoder_output_w)
else:
print('Sharing decoder input output embeddings')
return p
with torch.no_grad():
    model_files = glob(args.src)
    for index, model_file in enumerate(model_files):
        print('-' * 120)
        print("source model: '{}' ({}/{})".format(model_file, index + 1, len(model_files)))
        print("target model: '{}'".format(args.tgt + "." + str(index)))
        model = torch.load(model_file, map_location='cpu')
        meta_info = {
            'src_vocab_size': 0,
            'tgt_vocab_size': 0,
            'encoder_layer': model['args'].encoder_layers,
            'decoder_layer': model['args'].decoder_layers,
            'ffn_hidden_size': model['args'].encoder_ffn_embed_dim,
            'hidden_size': model['args'].decoder_input_dim,
            'emb_size': model['args'].encoder_embed_dim,
            'head_num': model['args'].encoder_attention_heads,
            'max_relative_length': model['args'].max_relative_length,
            'share_all_embeddings': model['args'].share_all_embeddings,
            'share_decoder_input_output_embed': model['args'].share_decoder_input_output_embed,
            'max_source_positions': model['args'].max_source_positions,
        }
        params = get_model_parameters(model)
        print('total params: ', len(params))
        print('total params size: ', sum([p.numel() for p in params]))
        model = model['model']
        with open(args.tgt + "." + str(index) + ".name.txt", "w") as name_list:
            for p in model:
                name_list.write("{}\t{}\n".format(p, model[p].shape))
                if 'embed_tokens' in p:
                    if 'encoder' in p:
                        meta_info['src_vocab_size'] = model[p].shape[0]
                    else:
                        meta_info['tgt_vocab_size'] = model[p].shape[0]
        meta_info_list = [
            meta_info['encoder_layer'],
            meta_info['decoder_layer'],
            meta_info['ffn_hidden_size'],
            meta_info['hidden_size'],
            meta_info['emb_size'],
            meta_info['src_vocab_size'],
            meta_info['tgt_vocab_size'],
            meta_info['head_num'],
            meta_info['max_relative_length'],
            meta_info['share_all_embeddings'],
            meta_info['share_decoder_input_output_embed'],
            meta_info['max_source_positions'],
        ]
        print(meta_info)
        meta_info_list = [int(p) for p in meta_info_list]
        meta_info = pack("i" * len(meta_info_list), *meta_info_list)
        with open(args.tgt + "." + str(index), 'wb') as tgt:
            # part 1: meta info
            tgt.write(meta_info)
            # part 2: values of parameters (in FP32 or FP16)
            for p in params:
                if args.mode == 'fp32':
                    values = pack("f" * p.numel(),
                                  *(p.contiguous().view(-1).cpu().numpy()))
                    tgt.write(values)
                elif args.mode == 'fp16':
                    # struct's 'e' (half-precision) format requires Python >= 3.6
                    values = pack("e" * p.numel(),
                                  *(p.contiguous().view(-1).cpu().numpy().astype(np.float16)))
                    tgt.write(values)
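# --- Hedged sketch (not part of the original script): read back the
# 12-integer meta header of a converted model to sanity-check the conversion.
# The field order mirrors meta_info_list above; the path 'model.0' is an
# assumed example output name.
from struct import unpack

def read_meta(path='model.0'):
    fields = ['encoder_layer', 'decoder_layer', 'ffn_hidden_size',
              'hidden_size', 'emb_size', 'src_vocab_size', 'tgt_vocab_size',
              'head_num', 'max_relative_length', 'share_all_embeddings',
              'share_decoder_input_output_embed', 'max_source_positions']
    with open(path, 'rb') as f:
        values = unpack('i' * len(fields), f.read(4 * len(fields)))
    return dict(zip(fields, values))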
'''
Prepare parallel training data for NiuTrans.NMT.
Help: python3 PrepareParallelData.py -h
Training data format (binary):
first 8 bytes: source and target vocabulary sizes (4 bytes each)
next 8 bytes: number of sentence pairs
subsequent segments:
    source sentence length (4 bytes)
    target sentence length (4 bytes)
    source tokens (4 bytes per token)
    target tokens (4 bytes per token)
'''
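# Layout example (added for illustration): for a corpus of one pair with
# source ids [5, 9] and target ids [7, 4], the file contains
#   2 ints   (source and target vocabulary sizes)
#   1 uint64 (pair count = 1)
#   2 ints   (lengths: 3 and 3, counting the appended EOS / prepended SOS)
#   3 ints   (source ids: 5, 9, EOS)
#   3 ints   (target ids: SOS, 7, 4)
# All values use struct's native byte order (little-endian on x86).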
from struct import pack
import argparse
# User-defined special token ids (SOS and EOS share id 2)
PAD = 1
SOS = 2
EOS = 2
UNK = 3
# The maximum length for a sentence
MAX_SENT_LEN = 120
parser = argparse.ArgumentParser(
    description='Prepare parallel data for NMT training')
parser.add_argument('-src', help='Source language file', type=str, default='')
parser.add_argument('-tgt', help='Target language file', type=str, default='')
parser.add_argument(
    '-src_vocab', help='Source language vocab file', type=str, default='')
parser.add_argument(
    '-tgt_vocab', help='Target language vocab file', type=str, default='')
parser.add_argument('-output', help='Training file', type=str, default='')
args = parser.parse_args()
src_vocab = dict()
tgt_vocab = dict()
cut_num = 0
def load_vocab(vocab, file):
    with open(file, 'r', encoding='utf8') as f:
        vocab_size = int(f.readline().split()[0])
        for l in f:
            l = l.split()
            vocab[l[0]] = int(l[1])
    print("{}: {} types".format(file, vocab_size))
    return vocab_size

def get_id(vocab, word, is_src=True):
    # is_src is unused but kept so call sites can mark the language side
    if word in vocab:
        return vocab[word]
    return UNK
src_vocab_size = load_vocab(src_vocab, args.src_vocab)
tgt_vocab_size = load_vocab(tgt_vocab, args.tgt_vocab)
if (not isinstance(src_vocab_size, int)) or (src_vocab_size < 0):
    raise ValueError("Invalid source vocab size")
if (not isinstance(tgt_vocab_size, int)) or (tgt_vocab_size < 0):
    raise ValueError("Invalid target vocab size")
with open(args.src, 'r', encoding='utf8') as fs:
    with open(args.tgt, 'r', encoding='utf8') as ft:
        src_sentences, tgt_sentences = list(), list()
        for ls in fs:
            ls = ls.split()
            lt = ft.readline().split()
            if len(ls) >= MAX_SENT_LEN:
                cut_num += 1
                ls = ls[:MAX_SENT_LEN - 1]
            if len(lt) >= MAX_SENT_LEN:
                cut_num += 1
                lt = lt[:MAX_SENT_LEN - 1]
            src_sent = [get_id(src_vocab, w) for w in ls] + [EOS]
            tgt_sent = [SOS] + [get_id(tgt_vocab, w, False) for w in lt]
            src_sentences.append(src_sent)
            tgt_sentences.append(tgt_sent)
# exclude the appended EOS / prepended SOS from the token counts
src_tokens = sum([len(s) - 1 for s in src_sentences])
tgt_tokens = sum([len(t) - 1 for t in tgt_sentences])
print("{}: {} sents, {} tokens, {:.2%} replaced by <UNK>".format(
    args.src, len(src_sentences), src_tokens, sum([s.count(UNK) for s in src_sentences]) / src_tokens))
print("{}: {} sents, {} tokens, {:.2%} replaced by <UNK>".format(
    args.tgt, len(tgt_sentences), tgt_tokens, sum([s.count(UNK) for s in tgt_sentences]) / tgt_tokens))
print("{} sentences were truncated to {} tokens".format(cut_num, MAX_SENT_LEN - 1))
with open(args.output, 'wb') as fo:
    # seg 1: source and target vocabulary sizes (4 bytes each)
    vocab_size = [src_vocab_size, tgt_vocab_size]
    vocab_size_pack = pack("i" * len(vocab_size), *vocab_size)
    fo.write(vocab_size_pack)
    # seg 2: number of sentence pairs (8 bytes)
    sent_num = [len(src_sentences)]
    sent_num_pack = pack("Q", *sent_num)
    fo.write(sent_num_pack)
    for i in range(len(src_sentences)):
        src_sent = src_sentences[i]
        tgt_sent = tgt_sentences[i]
        # seg 3: source and target sentence lengths (4 bytes each)
        src_tgt_length = [len(src_sent), len(tgt_sent)]
        src_tgt_length_pack = pack(
            "i" * len(src_tgt_length), *src_tgt_length)
        fo.write(src_tgt_length_pack)
        # seg 4: source and target token ids (4 bytes per token)
        src_sent_pack = pack("i" * len(src_sent), *src_sent)
        fo.write(src_sent_pack)
        tgt_sent_pack = pack("i" * len(tgt_sent), *tgt_sent)
        fo.write(tgt_sent_pack)
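# --- Hedged sketch (not part of the original script): decode the header and
# the first sentence pair of a file produced above, to verify the layout.
# 'train.bin' is an assumed example file name.
from struct import unpack

def peek_training_file(path='train.bin'):
    with open(path, 'rb') as f:
        src_vocab_size, tgt_vocab_size = unpack('ii', f.read(8))
        (pair_num,) = unpack('Q', f.read(8))
        src_len, tgt_len = unpack('ii', f.read(8))
        src_ids = unpack('i' * src_len, f.read(4 * src_len))
        tgt_ids = unpack('i' * tgt_len, f.read(4 * tgt_len))
    return src_vocab_size, tgt_vocab_size, pair_num, src_ids, tgt_ids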
'''
Convert a fairseq vocab to a NiuTrans.NMT vocab
Usage: python3 VocabConverter.py [fairseq_vocab] [niutrans_nmt_vocab]
'''
import sys
# User-defined special token ids (SOS and EOS share id 2)
PAD = 1
SOS = 2
EOS = 2
UNK = 3
with open(sys.argv[1], "r", encoding="utf8") as fi:
    with open(sys.argv[2], "w", encoding="utf8") as fo:
        lines = fi.readlines()
        # the first several indices are reserved
        start_id = UNK + 1
        # the first line: vocab_size, start_id
        fo.write("{} {}\n".format(len(lines) + start_id, start_id))
        # other lines: word, id
        for l in lines:
            fo.write("{} {}\n".format(l.split()[0], start_id))
            start_id += 1
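# Example (added for illustration): a fairseq dict line such as "the 1234"
# (word and frequency) becomes "the 4" here, since ids 0-3 are reserved for
# the special tokens above and numbering starts at start_id = UNK + 1 = 4.
# The first output line stores "<vocab_size> <start_id>", where vocab_size
# counts the reserved ids as well (len(lines) + start_id).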