Commit 3852f15a by huchi

Merge with branch: xiaotong-working

parent 98a9130d
# CMake minimum version
# cmake minimum version
cmake_minimum_required(VERSION 2.8)
# Project's name
project(NiuTensor)
# The prefix of the generated executable file
set(NIUTRANS_EXE "NiuTensor")
set(NIUTRANS_DLL "${NIUTRANS_EXE}")
# The name of the generated executable file
# The name of the dynamic link library
set(NIUTENSOR_EXE "NiuTensor")
set(NIUTENSOR_DLL "${NIUTENSOR_EXE}")
# Generated file path
set(EXECUTABLE_OUTPUT_PATH ../bin)
set(LIBRARY_OUTPUT_PATH ../lib)
# Use CMAKE_MACOSX_RPATH for MacOS
# Use CMAKE_MACOSX_RPATH for macOS
set(CMAKE_MACOSX_RPATH 1)
# Enable folder grouping in generated IDE projects
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
option(USE_CUDA "Use CUDA" OFF)
option(USE_HALF_PRECISION "Use Half Precision in CUDA Codes" OFF)
option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(USE_FP16 "Use FP16" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
# If USE_CUDA is set to ON, modify CUDA_TOOLKIT_ROOT below.
# If USE_MKL is set to ON, modify INTEL_ROOT below.
# If USE_OPENBLAS is set to ON, modify OPENBLAS_ROOT below.
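# Typical configure commands (illustrative only; the same examples appear
# in the install guide further below):
#   cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..
#   cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..
#   cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..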
if (USE_CUDA)
if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
if(WIN32)
message(STATUS "HERE cuda")
set(CUDA_TOOLKIT_ROOT_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
else()
set(CUDA_TOOLKIT_ROOT_DIR "/usr/cuda-9.0")
set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-10.1")
endif()
endif()
message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}")
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
if(WIN32)
message(STATUS "HERE mkl")
set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
else()
set(INTEL_ROOT "/usr/intel/compilers_and_libraries_2020.2.254/linux")
set(INTEL_ROOT "/opt/intel/compilers_and_libraries_2020.2.254/linux")
endif()
endif()
message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
......@@ -49,9 +52,9 @@ endif()
if(USE_OPENBLAS)
if(NOT DEFINED OPENBLAS_ROOT)
if(WIN32)
set(OPENBLAS_ROOT "D:/software/BaiduNetdiskDownload/thirdparty20170624/OpenBLAS")
set(OPENBLAS_ROOT "C:/Program Files/OpenBLAS")
else()
set(OPENBLAS_ROOT "/usr/OpenBLAS")
set(OPENBLAS_ROOT "/opt/OpenBLAS")
endif()
endif()
message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
......@@ -90,74 +93,90 @@ endfunction(my_add_executable)
# Set libs and compiler options for CUDA
if(USE_CUDA)
add_definitions(-DUSE_CUDA)
if(USE_FP16)
if(USE_HALF_PRECISION)
add_definitions(-DHALF_PRECISION)
endif()
find_package(CUDA ${CUDA_VERSION} REQUIRED)
find_package(CUDA REQUIRED)
if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
endif()
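# GPU_ARCH maps a one-letter architecture code to the matching nvcc flags;
# e.g. configuring with -DGPU_ARCH=T selects "-arch=compute_75 -code=sm_75"
# for Turing GPUs.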
if(USE_HALF_PRECISION)
if(NOT DEFINED GPU_ARCH)
set(ARCH_FLAGS -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_72,code=sm_72
-gencode=arch=compute_70,code=compute_70
)
elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
message(FATAL_ERROR "your GPU does not support half-precision computation")
endif()
endif()
if(WIN32)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-maxrregcount=0 -m64 -use_fast_math -DUSE_CUDA")
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} -arch=sm_30
-gencode=arch=compute_30,code=sm_30
-gencode=arch=compute_50,code=sm_50
-gencode=arch=compute_52,code=sm_52
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
endif()
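# cublasLt.lib is linked only for CUDA 11, presumably because cuBLAS
# depends on the separate cuBLASLt library there (an assumption based on
# the CUDA 11 toolkit layout).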
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}curand.lib")
else()
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
if(USE_FP16)
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -DHALF_PRECISION -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
else()
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11 ")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
-gencode=arch=compute_60,code=sm_60
-gencode=arch=compute_61,code=sm_61
-gencode=arch=compute_62,code=sm_62
-gencode=arch=compute_70,code=sm_70
-gencode=arch=compute_70,code=compute_70
)
endif()
link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
endif()
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
if(EXISTS "/usr/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
elseif(EXISTS "/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
endif()
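# The static CUDA libraries above need the dynamic loader at link time,
# so libdl is added explicitly; the chain probes its common locations on
# RHEL/CentOS (/usr/lib64) and Debian/Ubuntu (/lib/x86_64-linux-gnu).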
endif()
endif()
# Set libs and compiler options for MKL
if(USE_MKL)
add_definitions(-DMKL)
add_definitions(-DUSE_BLAS -DMKL)
set(COMPILER_DIR "${INTEL_ROOT}/compiler")
set(MKL_DIR "${INTEL_ROOT}/mkl")
set(CPU_ARCH intel64)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -DMKL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
include_directories(${MKL_DIR}/include)
......@@ -169,9 +188,9 @@ if(USE_MKL)
set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5md.lib")
else()
if(USE_CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -DMKL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder")
else()
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC -DMKL")
set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC")
endif(USE_CUDA)
link_directories(${MKL_DIR}/lib/intel64/)
link_directories(${COMPILER_DIR}/lib/intel64)
......@@ -187,10 +206,9 @@ endif()
# Set libs and compiler options for OpenBLAS
if(USE_OPENBLAS)
add_definitions(-DUSE_BLAS -DMKL)
add_definitions(-DUSE_BLAS -DOPENBLAS)
set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_BLAS")
if(WIN32)
link_directories(${OPENBLAS_LIB_DIR})
include_directories(${OPENBLAS_INCLUDE_DIR})
......@@ -211,15 +229,15 @@ set(OPENBLAS_LIB ${OPENBLAS_LIB_PATH})
# Generate dynamic link library about project
if(USE_CUDA)
if(GEN_DLL)
cuda_add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
cuda_add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
endif()
else()
if(GEN_DLL)
add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES})
add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES})
else()
my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES})
my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES})
endif()
endif()
......@@ -243,17 +261,17 @@ if(WIN32)
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB})
message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File :" ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB})
message(STATUS "Name of Executable File :" ${NIUTENSOR_EXE})
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB})
endif()
message(STATUS "${MESS}")
else()
add_definitions(-std=c++11)
set(MESS ${MESS} "On Linux")
set(MESS ${MESS} "On Linux or macOS")
if(USE_CUDA)
set(MESS ${MESS} " Use CUDA")
set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
......@@ -274,12 +292,12 @@ else()
if(GEN_DLL)
message(STATUS "Generate Dynamic Link Library")
message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB} ${FLAG})
message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB} ${FLAG})
else()
message(STATUS "Generate Makefile For Executable File")
message(STATUS "Name of Executable File: " ${NIUTRANS_EXE})
target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB} ${FLAG})
message(STATUS "Name of Executable File: " ${NIUTENSOR_EXE})
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
......@@ -39,14 +39,14 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
##### CMake (Visual Studio)
To install the NiuTensor toolkit on the WIndows platform, you can use CMake to generate a Visual Studio project automatically (CMake and the Visual Studio IDE must be installed beforehand). The steps are as follows:
To install the NiuTensor toolkit on the Windows platform, you can use CMake to generate a Visual Studio project automatically (CMake and the Visual Studio IDE must be installed beforehand). The steps are as follows:
- Create a directory under the toolkit root to hold the generated Visual Studio project files (e.g., a build directory).
- Open a Windows command-line tool (e.g., PowerShell) in the project root and run `cd build` to enter the newly created build directory.
- Run CMake to generate the Visual Studio project (for Visual Studio versions older than 2019, add the extra CMake argument `-A x64` to the commands below). To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON`; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply add `-DUSE_MKL=ON` to the CMake command and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- To enable the OpenBLAS math library (installed separately by the user), simply add `-DUSE_OPENBLAS=ON` to the CMake command and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- On success, CMake prints `Build files have been written to:...`
- Open the NiuTensor.sln file in the build directory to open the NiuTensor project in Visual Studio.
- After opening it, select NiuTensor in the Solution Explorer and right-click to set it as the startup project; you can then start using it.
......@@ -67,7 +67,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Open the CLion preferences, click CMake under the "Build, Execution, Deployment" tab, and configure the "CMake options"; once configured, CLion builds the project with CMake automatically. To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON` to the "CMake options"; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply enter `-DUSE_MKL=ON` in the "CMake options" and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- To enable the OpenBLAS math library (installed separately by the user), simply enter `-DUSE_OPENBLAS=ON` in the "CMake options" and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- To enable the CUDA math library (installed separately by the user), simply enter `-DUSE_CUDA=ON` in the "CMake options", specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply enter `-DUSE_CUDA=ON` in the "CMake options", specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `-DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
##### CMake (command line)
......@@ -78,7 +78,7 @@ The NiuTensor toolkit can be installed on Windows, Linux, and macOS…
- Run CMake to generate the project. To build a dynamic link library, simply add the CMake argument `-DGEN_DLL=ON`; otherwise an executable is generated by default.
- To enable the MKL math library (installed separately by the user), simply add `-DUSE_MKL=ON` to the CMake command and specify the installation path of MKL (the Intel toolkit) via `-DINTEL_ROOT='/intel/root/path'`, e.g. `cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- To enable the OpenBLAS math library (installed separately by the user), simply add `-DUSE_OPENBLAS=ON` to the CMake command and specify the OpenBLAS installation path via `-DOPENBLAS_ROOT='/openblas/root/path'`, e.g. `cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- To enable the CUDA math library (installed separately by the user), simply add `-DUSE_CUDA=ON` to the CMake command, specify the CUDA installation path via `-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`, and select the architecture of the target GPU via `-DGPU_ARCH=ARCH` (K: Kepler; M: Maxwell; P: Pascal; V: Volta; T: Turing; A: Ampere), e.g. `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`. To use half-precision floating point on the GPU, enable `-DUSE_HALF_PRECISION=ON` together with `-DUSE_CUDA=ON` (note that half precision is only supported on NVIDIA GPUs of the Pascal architecture or newer; see the [NVIDIA GPU list](https://developer.nvidia.com/cuda-gpus) to check your device).
- On success, CMake prints `Build files have been written to:...` and generates a Makefile in that directory.
- Run `make -j` to compile the NiuTensor project; on success it prints `Built target NiuTensor` and the installation is complete.
......@@ -137,4 +137,4 @@ The NiuTensor open-source computing library was developed by the NiuTrans open-source team of the Natural Language Processing Lab at Northeastern University…
## Version updates
NiuTensor version 0.3.5 - February 6, 2021
NiuTensor version 0.4.0 - March 13, 2021
......@@ -27,6 +27,7 @@
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
#include "./train/TTrain.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -38,8 +39,17 @@ using namespace nmt;
int main( int argc, const char ** argv )
{
if(argc > 1 && !strcmp(argv[1], "-test"))
XConfig config;
if(argc > 1){
config.Create(argc - 1, argv + 1);
verboseLevel = config.GetInt("verbose", 1);
}
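/* e.g. running "NiuTensor -test -verbose 2" (command shown for illustration)
   both selects the unit tests below and raises the logging level, since the
   XConfig lookup above reads "-verbose" with a default of 1 */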
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
TestTrain();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
......@@ -47,7 +57,8 @@ int main( int argc, const char ** argv )
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
......
......@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for an activation function */
......
......@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a loss computation */
......
......@@ -125,6 +125,9 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
else{
ShowNTErrors("Unsupported backward computation! TODO!");
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a math operation */
......@@ -156,14 +159,16 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sign(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
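/* The locking idiom above recurs throughout this file: the memory pool's
   buffer is locked before NewTensorBufV2() hands out a temporary tensor,
   and unlocked after DelTensorBuf() returns it. A minimal sketch:

       if (a->mem != NULL)
           a->mem->LockBuf();
       XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
       ...                      // compute with tmp
       DelTensorBuf(tmp);       // return the buffer
       if (a->mem != NULL)
           a->mem->UnlockBuf();

   The NULL checks cover tensors that have no memory pool and hence no
   buffer to lock. */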
/*
......@@ -187,15 +192,17 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sin(a, tmp);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -219,14 +226,16 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Exp(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -251,8 +260,6 @@ void XMathGrad::GradLog(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Div(node->grad, a, a->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -276,8 +283,6 @@ void XMathGrad::GradRound(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -301,8 +306,6 @@ void XMathGrad::GradSign(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -326,14 +329,16 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -352,20 +357,23 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
/* dE/da = dE/dc * 1/(cos(a))^2
= dE/dc * (cos(a))^-2 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_PowerMe(tmp, -2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -392,14 +400,16 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ClipBackward(node, a, node->grad, tmp, lower, upper);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -432,6 +442,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
= dE/dc * a * (-b^-2) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(b, tmp, -2.0F);
_NegateMe(tmp);
......@@ -439,9 +451,9 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
_Multiply(node->grad, tmp, b->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -478,9 +490,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->LockBuf();
}
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
......@@ -522,6 +542,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
// b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
......@@ -532,15 +553,22 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
// b->mem->UnlockBuf();
}
DelTensorBuf(interGradTMP);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->UnlockBuf();
}
DelTensorBuf(bTMP);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
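/* Note the lock order above: a->mem first, then b->mem, then node->mem,
   each skipped when it aliases an already-locked pool, with the unlocks
   in reverse order. Keeping this acquisition order consistent avoids
   deadlock when the tensors live in different memory pools. */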
/*
......@@ -602,8 +630,6 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
else{
ShowNTErrors("TODO!");
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -757,8 +783,6 @@ void XMathGrad::GradMatrixMulBatched(XTensor * node, bool isEfficient)
if (!isEfficient || b->isGrad)
_MatrixMulBatched(dedc, X_TRANS, a, X_TRANS, dedb, alpha, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -793,8 +817,6 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -830,6 +852,8 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
......@@ -842,12 +866,18 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -866,6 +896,9 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
......@@ -876,11 +909,14 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
DelTensorBuf(bGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -916,8 +952,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
if (b->isVar || b->income.tailNum > 0)
ShowNTErrors("TODO");
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -942,8 +976,6 @@ void XMathGrad::GradNegate(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, -1.0F);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -980,15 +1012,17 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, p - 1.0F);
_ScaleAndShiftMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
......@@ -1012,15 +1046,17 @@ void XMathGrad::GradReciprocal(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -2.0F);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1043,14 +1079,16 @@ void XMathGrad::GradSqrt(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleMe(tmp, 2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1073,15 +1111,17 @@ void XMathGrad::GradSquare(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -0.5F);
_ScaleMe(tmp, 0.5);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1109,8 +1149,6 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1138,8 +1176,6 @@ void XMathGrad::GradScale(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad, scale);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1166,9 +1202,7 @@ void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1 / descale);
}
node->visitMark = NODE_FINISHED;
}
}
/*
......@@ -1194,8 +1228,6 @@ void XMathGrad::GradShift(XTensor * node, bool isEfficient)
_Sum(a->grad, node->grad, a->grad);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1229,8 +1261,6 @@ void XMathGrad::GradSub(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, -beta);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1275,12 +1305,16 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1301,6 +1335,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1315,10 +1351,10 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1352,8 +1388,6 @@ void XMathGrad::GradSum(XTensor * node, bool isEfficient)
XNoder::MakeGrad(b);
_Sum(b->grad, node->grad, b->grad, beta);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1399,12 +1433,16 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1425,6 +1463,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1439,10 +1479,10 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1480,8 +1520,6 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
ShowNTErrors("TODO");
}
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1509,15 +1547,17 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_ScaleAndShiftMe(tmp, 1.0F / n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1545,13 +1585,15 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1576,14 +1618,16 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
DTYPE value = node->grad->Get0D();
tmp->SetDataFixed(value);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1605,9 +1649,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1636,11 +1685,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1663,9 +1715,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1693,11 +1750,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1742,10 +1802,14 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1766,6 +1830,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1777,6 +1843,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......@@ -1815,9 +1883,6 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
/*
......@@ -1884,6 +1949,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
Then reduce along z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor* interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1895,6 +1962,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......@@ -1933,9 +2002,6 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
}
......@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
dims[0] = -abs(dims[0]);
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
dims[whereToMerge - leadDim] *= abs(dims[0]);
int * dimsNode = dims + 1;
dimsNode[0] = -abs(dimsNode[0]);
XTensor gradNodeSmall(node->order - leadDim, dimsNode,
node->dataType, node->denseRatio,
node->devID, node->mem);
......@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
node->visitMark = NODE_DOING;
node->isGradFinished = true;
}
/*
......@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
\ No newline at end of file
......@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
node->isGradFinished = false;
}
/* back-propagation from output to input */
......@@ -108,7 +109,7 @@ void XNet::Backward(TensorList &roots)
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
//CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark != NODE_FINISHED)
......@@ -127,7 +128,20 @@ void XNet::Backward(TensorList &roots)
delete node;
}
}
}
}
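/* a heuristic sanity check: the messages below warn when a node or its
   gradient connects to 100 or more nodes, which usually signals a
   mis-built computation graph */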
for (int i = 0; i < nodes.count; i++) {
XTensor* node = (XTensor*)nodes.Get(i);
if (node->income.tailNum >= 100 || node->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure this node should connect to so many (100 or more) nodes?\n");
}
if (node->grad != NULL) {
XTensor* grad = node->grad;
if (grad->income.tailNum >= 100 || grad->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure this grad node should connect to so many (100 or more) nodes?\n");
}
}
}
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -224,8 +224,6 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor maskDec;
XTensor maskEncDec;
bool debug(false);
/* encoder mask */
MakeMTMaskEnc(paddingEnc, maskEnc);
......@@ -234,25 +232,9 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);
if (debug) {
LOG("after encoding:");
encoding.mem->ShowMemUsage(stderr);
}
decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);
if (debug) {
LOG("after decoding:");
encoding.mem->ShowMemUsage(stderr);
}
outputLayer->Make(decoding, output, true, true);
if (debug) {
LOG("after outputing:");
encoding.mem->ShowMemUsage(stderr);
exit(0);
}
}
/*
......@@ -287,6 +269,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor* maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
......@@ -297,6 +280,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -305,6 +289,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, 1.0F,
paddingEnc.devID, paddingEnc.mem);
......@@ -331,6 +316,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......@@ -344,7 +330,6 @@ void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
......@@ -378,7 +363,6 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
delete[] dims;
......@@ -571,4 +555,14 @@ void Model::Read(FILE* file)
LOG("model loaded (took %.1fs)", elapsed);
}
XModel* Model::Clone(int devID)
{
return nullptr;
}
bool Model::RunSimple(XList* inputs, XList* outputs, XList* golds, XList* losses)
{
return false;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -24,17 +24,18 @@
#include "Encoder.h"
#include "Decoder.h"
#include "Utility.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "submodel/Attention.h"
#include "../../train/XModel.h"
namespace nmt
{
/* a nmt model that keeps parameters of the encoder,
/* an nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
class Model : public XModel
{
public:
/* device id */
......@@ -85,26 +86,26 @@ public:
/* make the encoding network */
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
XTensor& MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrices */
void GetParams(TensorList& list);
......@@ -114,6 +115,13 @@ public:
/* read the parameters */
void Read(FILE* file);
public:
/* clone the model (overloaded method of XModel) */
XModel * Clone(int devID);
/* run the neural network (overloaded method of XModel) */
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -28,6 +28,7 @@
#include "Utility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XConfig.h"
using namespace nts;
using namespace std;
......@@ -165,89 +166,7 @@ int Config::LoadFromFile(const char* configFN, char** args) {
return argsNum;
}
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
/*
split string by delimiter, this will return indices of all sub-strings
......@@ -281,7 +200,9 @@ IntList SplitInt(const string& s, const string& delimiter)
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
/* this line is problematic: why do we need an IntList to keep an int64? */
values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
......@@ -297,4 +218,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
return values;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,16 +34,6 @@ namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
......@@ -115,10 +105,10 @@ public:
/* the maximum length in positional embedding */
int maxPosition;
/* the maximum length for the source sequence */
/* the maximum length of the source sequence */
int maxSrcLen;
/* the maximum length for the target sequence */
/* the maximum length of the target sequence */
int maxTgtLen;
/* the dimension of fnn hidden layer */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -259,7 +259,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = float(sqrt(d / nhead));
float scaling = (float)sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
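/* standard scaled dot-product attention: the queries are divided by
   sqrt(d_k), where d_k = d / nhead is the per-head dimension */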
......@@ -373,7 +373,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = BMMul(xTrans, X_NOTRANS, z, transposeFlag);
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -67,9 +67,7 @@ void FNN::InitModel(Config& config)
float scale = 1.0F;
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
//w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
//w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -29,6 +29,7 @@
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#include "../../../train/XBaseTemplate.h"
using namespace std;
......@@ -74,8 +75,8 @@ struct ReservedIDs {
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
struct TrainDataSet : public DataDistributeBase
{
public:
/* the pointer to file stream */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -97,7 +97,6 @@ initialization
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
......@@ -242,17 +241,8 @@ void Trainer::Train(const char* fn, const char* validFN,
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
net.isGradEfficient = true;
bool debug(false);
if (debug) {
LOG("after forward:");
batchEnc.mem->ShowMemUsage(stderr);
exit(0);
}
if (doUpdate) {
/* back-propagation */
net.Backward(lossTensor);
if (model->encoder->useHistory)
......@@ -502,6 +492,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBufV2(v, v->devID, v->mem);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -511,6 +502,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
base = (length + 5.0F) / (1.0F + 5.0F);
lp = float(pow(base, alpha));
lp = (float)pow(base, alpha);
return lp;
}
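/* a worked example (illustrative): with length = 10 and alpha = 0.6,
   base = (10 + 5) / 6 = 2.5 and lp = 2.5^0.6 ~= 1.73, so the score of a
   10-word hypothesis is normalized by ~1.73 rather than by its raw length */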
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
//float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
//bool isCompleted = state.isCompleted &&
// (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
......@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
......@@ -874,4 +873,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
delete[] finishedFlags;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -161,7 +161,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
batchLoader.outputBuffer.emplace_back(emptyRes);
}
double startDump = GetClockSec();
//double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
......@@ -169,10 +169,10 @@ void Translator::Translate(const char* ifn, const char* sfn,
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
//double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.outputBuffer.size() + batchLoader.emptyLines.size());
wordCountTotal, batchLoader.inputBuffer.size() + batchLoader.emptyLines.size());
}
/*
......@@ -202,4 +202,4 @@ void Translator::Dump(FILE* file, XTensor* output)
}
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
startID = (int)stol(sid);
vocabSize = (int)stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
word2id[word] = (int)stol(id);
id2word[(int)stol(id)] = word;
}
f.close();
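/* the file layout this loader expects, judging from the reads above (a
   sketch; the exact whitespace is whatever operator>> accepts):

       <vocabSize> <startID>
       <word> <id>
       <word> <id>
       ...

   e.g., a first line of "32000 4" would mean 32000 entries with ordinary
   words starting at id 4, ids 0..3 presumably reserved for special symbols */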
......@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
id2word.insert(i2w);
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of parameters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XConfig::XConfig()
{
n = 0;
args = NULL;
nReal = 0;
}
/* de-constructor */
XConfig::~XConfig()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
}
/* clear it */
void XConfig::Clear()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
n = 0;
args = NULL;
nReal = 0;
}
/*
create a config
>> myN - number of the input arguments
>> myArgs - the input arguments
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
CheckNTErrors(myN > 0, "No input parameters to XConfig!");
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
args = NULL;
n = myN;
nReal = n * 2;
args = new char*[nReal];
for (int i = 0; i < nReal; i++) {
args[i] = NULL;
}
for (int i = 0; i < n; i++) {
CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
args[i] = new char[strlen(myArgs[i]) + 1];
strcpy(args[i], myArgs[i]);
}
}
/*
add an argument
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
CheckNTErrors(myArg != NULL, "No argument!");
if (n + 2 > nReal) {
nReal = MAX(n * 2 + 1, 128);
char ** newArgs = new char*[nReal];
memset(newArgs, 0, sizeof(char*) * nReal);
memcpy(newArgs, args, sizeof(char*) * n);
delete[] args;
args = newArgs;
}
args[n] = new char[strlen(myArg) + 2];
args[n][0] = '-';
strcpy(args[n] + 1, myArg);
n++;
if (myValue != NULL) {
args[n] = new char[strlen(myValue) + 1];
strcpy(args[n], myValue);
n++;
}
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, bool myValue)
{
char value[2];
if (myValue)
value[0] = '1';
else
value[0] = '0';
value[1] = 0;
Add(myArg, value);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
int XConfig::GetInt(const char * name, int defaultP)
{
int r;
LoadInt(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
bool r;
LoadBool(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in float)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
float r;
LoadFloat(name, &r, defaultP);
return r;
}
/* get item number */
int XConfig::GetItemNum()
{
return n;
}
/*
get the item with offset i
>> i - offset
*/
char * XConfig::GetItem(int i)
{
if (i < n && i >= 0)
return args[i];
else
return NULL;
}
/*
initialize with another config model
>> myConfig - the config that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
Clear();
for (int i = 0; i < myConfig.GetItemNum(); i++)
Add(myConfig.GetItem(i), i);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
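/* note: unlike the other loaders, LoadParamBool treats the argument as a
   presence flag -- "-myFlag" alone sets *p to true and no value token is
   consumed; the default applies only when the flag is absent */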
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
/*
show the argument list
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
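/* a minimal usage sketch of the XConfig API above (illustrative, not part
   of this commit):

       const char * rawArgs[] = { "-dev", "0", "-lrate", "0.001" };

       XConfig config;
       config.Create(4, rawArgs);                     // copies the raw argument list
       int   dev   = config.GetInt("dev", -1);        // -> 0
       float lrate = config.GetFloat("lrate", 1.0F);  // -> 0.001F
       config.Add("beam", 4);                         // appends "-beam" "4"

   note that Create() expects arguments that already carry their '-' prefix,
   while Add() and the getters take bare names and prepend '-' themselves */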
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of slots we have allocated for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config model */
void CreateFromMe(XConfig &myConfig);
};
#define MAX_PARAM_NUM 100
/* load arguments */
void extern LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void extern LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void extern LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void extern LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void extern ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -182,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID);
mem->Free();
#ifdef USE_CUDA
int devIDReset = devID;
Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) {
int devIDBackup = -1;
cudaGetDevice(&devIDBackup);
......@@ -195,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup);
}
#else
Clear();
#endif
}
......
......@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
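// a minimal cross-platform usage sketch of the macros above (illustrative):
//
//     MUTEX_HANDLE mtx;
//     MUTEX_INIT(mtx);     // CRITICAL_SECTION on Windows, pthread_mutex_t elsewhere
//     MUTEX_LOCK(mtx);
//     /* ... critical section ... */
//     MUTEX_UNLOCK(mtx);
//     MUTEX_DELE(mtx);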
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
......
......@@ -36,7 +36,7 @@ TensorListBase<T>::TensorListBase()
{
maxNum = 1;
count = 0;
items = (T*)malloc(sizeof(T) * 1);
items = new T[1];
}
/*
......@@ -49,7 +49,7 @@ TensorListBase<T>::TensorListBase(int myMaxNum)
CheckNTErrors(myMaxNum > 0, "check if the input number > 0");
maxNum = myMaxNum;
count = 0;
items = (T*)malloc(sizeof(T) * myMaxNum);
items = new T[myMaxNum];
}
/*
......@@ -62,7 +62,7 @@ TensorListBase<T>::TensorListBase(const T* inputItems, int inputItemCount)
CheckNTErrors(inputItemCount > 0, "check if the input number > 0");
maxNum = inputItemCount;
count = inputItemCount;
items = (T*)malloc(sizeof(T) * inputItemCount);
items = new T[inputItemCount];
memcpy(items, inputItems, inputItemCount * sizeof(T));
}
......@@ -73,7 +73,7 @@ TensorListBase<T>::TensorListBase(const TensorListBase<T>& l)
CheckNTErrors(l.maxNum > 0, "check if the input number > 0");
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
}
......@@ -94,7 +94,7 @@ TensorListBase<T> TensorListBase<T>::operator=(const TensorListBase<T>& l)
{
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
return *this;
}
......@@ -105,7 +105,7 @@ TensorListBase<T> TensorListBase<T>::operator=(TensorListBase<T>&& l)
{
maxNum = l.maxNum;
count = l.count;
items = (T*)malloc(sizeof(T) * maxNum);
items = new T[maxNum];
memcpy(items, l.items, l.count * sizeof(T));
return *this;
}
......@@ -115,10 +115,25 @@ template <typename T>
TensorListBase<T>::~TensorListBase()
{
if(items != NULL)
free(items);
delete[] items;
items = NULL;
}
/*
reallocate
>> itemNum - the number of items
*/
template <typename T>
void TensorListBase<T>::Reallocate(int itemNum)
{
if (maxNum < itemNum) {
T * newItems = new T[itemNum];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = itemNum;
}
}
/*
add an item into the list
......@@ -128,20 +143,10 @@ template <typename T>
void TensorListBase<T>::Add(T&& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
items[count++] = item;
......@@ -162,24 +167,49 @@ template <typename T>
void TensorListBase<T>::Add(const T& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
items[count++] = item;
}
/* add an item (as an integer) into the list */
template <typename T>
void TensorListBase<T>::AddInt(const int item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(int*)(items + count) = item;
count++;
}
/* add an item (as a float) into the list */
template <typename T>
void TensorListBase<T>::AddFloat(const float item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(float*)(items + count) = item;
count++;
}
/* add an item (as a long long) into the list */
template <typename T>
void TensorListBase<T>::AddLLong(const long long item)
{
if (count == maxNum)
Reallocate(count * 2 + 1);
*(long long*)(items + count) = item;
count++;
}
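/* a usage sketch of the typed accessors above (illustrative; assumes T is
   at least as wide as the stored type, e.g., XList with T = void*):

       XList list(8);
       list.AddInt(42);             // stores an int into a pointer-sized slot
       list.AddFloat(0.5F);
       int   i = list.GetInt(0);    // -> 42
       float f = list.GetFloat(1);  // -> 0.5F
*/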
/*
add a number of items into the list
>> inputItems - pointer to the array of items
......@@ -189,18 +219,10 @@ template <typename T>
void TensorListBase<T>::Add(const T* inputItems, int inputItemCount)
{
if (count + inputItemCount >= maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count + inputItemCount + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (maxNum + count + inputItemCount + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T* newItems = new T[maxNum + count + inputItemCount + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum += (count + inputItemCount + 1);
}
memcpy(items + count, inputItems, sizeof(T) * inputItemCount);
......@@ -226,18 +248,10 @@ template <typename T>
void TensorListBase<T>::Insert(int pos, const T& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
......@@ -251,18 +265,10 @@ template<typename T>
void TensorListBase<T>::Insert(int pos, T&& item)
{
if (count == maxNum) {
T* newItems;
newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
if (newItems != NULL)
items = newItems;
else {
newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
T * newItems = new T[count * 2 + 1];
memcpy(newItems, items, count * sizeof(T));
delete[] items;
items = newItems;
maxNum = count * 2 + 1;
}
......@@ -274,16 +280,64 @@ void TensorListBase<T>::Insert(int pos, T&& item)
/* get the item at position i */
template <typename T>
T& TensorListBase<T>::GetItem(int i) const
inline T& TensorListBase<T>::GetItem(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
}
/* get the item at position i and force it to an integer */
template <typename T>
inline int TensorListBase<T>::GetItemInt(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(int*)p;
}
}
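/* note: the copy into a local T followed by the void* round-trip simply
   reinterprets the first sizeof(int) bytes of the slot; it mirrors AddInt
   and SetItemInt above, which store a raw int into a T-sized slot, so T
   must be at least as wide as the stored type */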
/* get the item at position i and force it to a float number */
template <typename T>
inline float TensorListBase<T>::GetItemFloat(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(float*)p;
}
}
/* get the item at position i and force it to a long long number */
template <typename T>
inline long long TensorListBase<T>::GetItemLLong(int i) const
{
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return 0;
else {
T r = items[i];
void * p = &r;
return *(long long*)p;
}
}
/* set the item at position i */
template <typename T>
inline void TensorListBase<T>::SetItem(int i, const T& item)
......@@ -299,6 +353,33 @@ inline void TensorListBase<T>::SetItem(int i, T&& item)
items[i] = item;
}
/* set the item (as an integer) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemInt(int i, const int item)
{
if (i >= 0 && i < count) {
*(int*)(items + i) = item;
}
}
/* set the item (as a float) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemFloat(int i, const float item)
{
if (i >= 0 && i < count) {
*(float*)(items + i) = item;
}
}
/* set the item (as a long long) at position i */
template<typename T>
inline void TensorListBase<T>::SetItemLLong(int i, const long long item)
{
if (i >= 0 && i < count) {
*(long long*)(items + i) = item;
}
}
/*
find the position of the first matched item
>> item - the item for matching
......@@ -329,7 +410,7 @@ void TensorListBase<T>::Clear()
count = 0;
maxNum = 0;
if(items != NULL)
free(items);
delete[] items;
items = NULL;
}
......@@ -384,7 +465,7 @@ void TensorListBase<T>::Reserve(int n)
return;
}
items = (T*)malloc(sizeof(T) * n);
items = new T[n];
}
/*
......@@ -430,8 +511,8 @@ void TensorListBase<T>::ReadFromFile(FILE* fp, int num)
if(!items)
Reserve(num - maxNum);
else {
free(items);
items = (T*)malloc(sizeof(T) * num);
delete[] items;
items = new T[num];
}
}
fread(items, sizeof(T), num, fp);
......
......@@ -75,6 +75,9 @@ public:
/* de-constructor */
~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */
void Add(T&& item);
......@@ -84,6 +87,15 @@ public:
/* add an item into the list */
void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
......@@ -99,12 +111,30 @@ public:
/* get the item at position i */
T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to a long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */
int FindFirst(const T& item);
......@@ -135,7 +165,13 @@ public:
/* short */
T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
};
struct XTensor;
......
......@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
}
/*
......@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}
......@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
}
/*
......@@ -379,12 +385,18 @@ require a piece of memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize);
p = AllocStandard(myDevID, mySize);
else if(isStatic)
return AllocStatic(myDevID, mySize);
p = AllocStatic(myDevID, mySize);
else
return AllocDynamic(myDevID, mySize);
p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
}
/*
......@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock
happens when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch;
......@@ -560,8 +577,10 @@ release a piece of memory
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
}
/*
......@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
}
bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
}
/*
......@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result;
}
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
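/* the intended discipline, as used by the callers in this commit (a sketch):

       if (mem != NULL)
           mem->LockBuf();                              // claim the buffer
       XTensor * tmp = NewTensorBufV2(&a, a.devID, a.mem);
       ... work with tmp ...
       DelTensorBuf(tmp);                               // release in reverse order
       if (mem != NULL)
           mem->UnlockBuf();                            // give the buffer back
*/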
/*
find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size
......@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
}
}
}
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
}
/* initialize it and set the global memory information */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h>
#include <stdlib.h>
......@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public:
/* constructor */
......@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize);
......
......@@ -215,7 +215,8 @@ void XQueue::DequeueJobs(XList * args)
int devID = *(int*)args->GetItem(1);
int devIDBackup = -1;
XDevice::SetDevice(devID, devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -236,7 +237,8 @@ void XQueue::DequeueJobs(XList * args)
}
XDevice::SetDevice(devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -248,7 +250,11 @@ bool XQueue::GetJobBreak()
/* get the number of jobs */
int XQueue::GetJobNum()
{
return runningJobCount;
MUTEX_LOCK(jobQueueMutex);
int c = runningJobCount;
MUTEX_UNLOCK(jobQueueMutex);
return c;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -1985,6 +1985,19 @@ void XTensor::FlushToMem(XMem* targetMem)
}
}
/*
flush the data to the target device (with id)
>> myDevID - id of the target device
*/
void XTensor::FlushToDevice(int myDevID)
{
if (myDevID == devID)
return;
XMem * myMem = GMems.GetMem(myDevID);
FlushToMem(myMem);
}
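/* e.g., tensor.FlushToDevice(0) moves the tensor to GPU 0 and
   tensor.FlushToDevice(-1) brings it back to the CPU (following the usual
   devID convention in this codebase: >= 0 for GPUs, < 0 for the host) */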
/*
allocate the memory space of the tensor (in the global memory)
>> tensor - the tensor we intend to process
......
......@@ -457,6 +457,9 @@ public:
/* flush the data to the target device */
void FlushToMem(XMem * targetMem);
/* flush the data to the target device (with id) */
void FlushToDevice(int myDevID);
/* allocate the memory space of the tensor (in the global memory) */
static
void AllocateData(XTensor * tensor, XMem * myMem = NULL, bool useBuf = false);
......
......@@ -54,37 +54,6 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile XList*);
/*
......
......@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{
if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost;
return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice;
return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost;
return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else
return cudaMemcpyDeviceToDevice;
return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
}
#endif
......@@ -485,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */
void XSleep(int sleepTime)
{
if (sleepTime <= 0)
return;
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
......@@ -553,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
hi = (char*)data + (long)realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
indexhi = index != NULL ? (int*)index + (long)stride * (num - 1) : NULL;
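/* note: the (long) casts above widen the offset arithmetic so that
   (num - 1) * stride is not computed in 32-bit int, an overflow risk
   when sorting very large tensors */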
recurse:
......@@ -565,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else {
mid = lo + (size/2) * realStride;
indexmid = indexlo + (size/2) * stride;
mid = lo + (long)(size/2) * realStride;
indexmid = indexlo + (long)(size/2) * stride;
/* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0)
......@@ -834,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0;
if (sepLen == 0) {
char* item = new char[inputLen + 1];
char* item = new char[(long)inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
//int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
//int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
}
\ No newline at end of file
}
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> index - index of the output dimension (over the vocabulary)
>> onehot - one-hot representation of the index
>> size - vocabulary size (last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
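          E.g., with size = 4 and p = 0.1, the target position would get
          1 - p = 0.9 and each other position p / (size - 1), i.e., ~0.033
          (assuming the usual smoothing scheme; this would also explain why
          p = 1 gives only an *almost* uniform distribution - the target
          position itself goes to 0)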
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
......
......@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
int dim = 0;
int order = source->order;
//int dim = 0;
//int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along with a dimension.
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
......
......@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
......@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
int blockSize = 1;
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
XTensor * smallsItem0 = smalls->GetItem(0);
int unitNum = smallsItem0->unitNum;
//int unitNum = smallsItem0->unitNum;
int unitSize = smallsItem0->unitSize;
int itemSize = unitNum * unitSize;
for (int i = 0; i < smallsItem0->order; i++) {
if (i >= dim)
......@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
for (int i = 0; i < tensor->order; i++) {
for (int i = 0; i < order; i++) {
if (i < dim) {
if (t.GetDim(i) != tensor->GetDim(i))
return false;
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......
......@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(max);
DelTensorBuf(sum);
if (mem != NULL)
mem->UnlockBuf();
if (x->devID >= 0) {
delete blockx;
......
......@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * max = NULL;
XTensor * sum = NULL;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(sum);
DelTensorBuf(max);
if (mem != NULL)
mem->UnlockBuf();
delete[] dimSize;
}
......
......@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL)
output->mem->LockBuf();
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
......@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL)
output->mem->UnlockBuf();
return loss;
}
......
......@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
......@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
}
/*
......@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
......@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
return loss;
}
......
......@@ -215,12 +215,7 @@ bool TestConvertDataType3()
{0.5F, -4.0F},
{0.0F, 6.0F} };
DTYPE data2[2][3] = { {1.0F, 2.0F, 3.0F},
{0.0F, 4.0F, 5.0F} };
DTYPE answer[3][3] = { {1.0F, -6.0F, -7.0F},
{0.5F, -15.0F, -18.5F},
{0.0F, 24.0F, 30.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -241,6 +236,14 @@ bool TestConvertDataType3()
cpuTest = _CheckData(a, data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
DTYPE data2[2][3] = { { 1.0F, 2.0F, 3.0F },
{ 0.0F, 4.0F, 5.0F } };
DTYPE answer[3][3] = { { 1.0F, -6.0F, -7.0F },
{ 0.5F, -15.0F, -18.5F },
{ 0.0F, 24.0F, 30.0F } };
/* GPU test */
bool gpuTest = true;
......
......@@ -67,7 +67,6 @@ bool TestGather1()
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
......
......@@ -422,7 +422,7 @@ bool TestSetData6()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
//DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
/* CPU test */
bool cpuTest = true;
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We test XTrain here. It is simple: we design a toy task in which we
* make the model predict an integer E (0-100) from four input integers
* A, B, C and D (0-100). We generate a number of samples with different values
* of A, B, C and D. The gold standard is
*
* E = (int)(sqrt(A * B) + abs(C - D))/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
*/
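As a worked instance of the gold standard above (values chosen for easy arithmetic): with A = 16, B = 4, C = 30, D = 10,

    E = (int)((sqrt(16 * 4) + abs(30 - 10)) / 2) = (int)((8 + 20) / 2) = 14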
#include "TTrain.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/function/FHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor * tmpTT = NULL;
/* generate the training data file */
void GeneateTTrainData(const char * fileName)
{
FILE * file = fopen(fileName, "wb");
CheckNTErrors(file, "Cannot open the file");
XPRINT(1, stderr, "[INFO] Generating data ... ");
int sampleNum = MAX_SAMPLE_NUM_IN_TTRAIN;
int range = MAX_INT_IN_TTRAIN;
fprintf(file, "%d\n", sampleNum);
srand(1);
for (int i = 0; i < sampleNum; i++) {
int A = (int)(((float)rand() / RAND_MAX) * range);
int B = (int)(((float)rand() / RAND_MAX) * range);
int C = (int)(((float)rand() / RAND_MAX) * range);
int D = (int)(((float)rand() / RAND_MAX) * range);
int E = (int)((sqrt(A * B) + abs(C - D)) / 2);
fprintf(file, "%d %d %d %d %d\n", A, B, C, D, E);
}
XPRINT2(1, stderr, "%d samples in \"%s\" [DONE]\n", sampleNum, fileName);
fclose(file);
}
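The generated file therefore starts with the sample count and then holds one sample per line, "A B C D E" (illustrative values; the last field is the gold standard computed by the formula above):

200000
16 4 30 10 14
25 4 50 20 20
...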
/* run the test */
void TestTrain()
{
GeneateTTrainData("ttrain.txt");
XConfig config;
//config.Add("dev", -1);
config.Add("lrate", 0.1F);
config.Add("nstep", 100000);
config.Add("nepoch", 5);
config.Add("jobdev0", 0);
//config.Add("jobdev4", -1);
int serverDevID = config.GetInt("jobdev0", -1);
TTDataLoader loader;
loader.SetFileName("ttrain.txt");
loader.SetBatchSize(config.GetInt("batchsize", TT_BATCH_SIZE));
TTModel model;
model.Init(config, serverDevID);
tmpTT = model.params[0].param;
XOptimizer optimizer;
optimizer.Init(config);
XTrainer trainer;
trainer.Run(&config, &loader, &model, &optimizer);
}
/*****************************
* data loader
******************************/
/* constructor */
TTDataLoader::TTDataLoader()
{
fileName = new char[MAX_FILE_NAME_LENGTH];
file = NULL;
batchSize = TT_BATCH_SIZE;
}
/* de-constructor */
TTDataLoader::~TTDataLoader()
{
delete[] fileName;
}
/* set file name */
void TTDataLoader::SetFileName(const char * myFileName)
{
strcpy(fileName, myFileName);
}
/* set batch size */
void TTDataLoader::SetBatchSize(int myBatchSize)
{
batchSize = myBatchSize;
}
/* start the process */
bool TTDataLoader::Start()
{
file = fopen(fileName, "rb");
CheckNTErrors(file != NULL, "Cannot open the file");
/* skip the first line */
char * line = new char[MAX_SAMPLE_LINE_LENGTH];
fgets(line, MAX_SAMPLE_LINE_LENGTH, file);
delete[] line;
return true;
}
/* end the process */
bool TTDataLoader::End()
{
fclose(file);
return true;
}
/*
get a batch of samples
>> inputs - inputs of the model
>> golds - gold standards
*/
bool TTDataLoader::GetBatchSimple(XList * inputs, XList * golds)
{
CheckNTErrors(file != NULL, "No input file specified!");
CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong argument!");
CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong argument!");
XTensor * input = (XTensor*)inputs->GetItem(0);
XTensor * gold = (XTensor*)golds->GetItem(0);
int count = 0;
int sampleSize = MAX_SAMPLE_SIZE;
char * line = new char[MAX_SAMPLE_LINE_LENGTH];
int * inputBatch = new int[batchSize * sampleSize];
int * goldBatch = new int[batchSize];
int A, B, C, D, E;
while (fgets(line, MAX_SAMPLE_LINE_LENGTH, file)) {
if (count == batchSize)
break;
if (sscanf(line, "%d %d %d %d %d", &A, &B, &C, &D, &E) < sampleSize + 1) {
ShowNTErrors("Wrong format in the training file!");
}
inputBatch[count * sampleSize] = A;
inputBatch[count * sampleSize + 1] = B;
inputBatch[count * sampleSize + 2] = C;
inputBatch[count * sampleSize + 3] = D;
goldBatch[count] = E;
count++;
}
if (count > 0) {
InitTensor2D(input, count, 4, X_INT);
InitTensor2D(gold, count, 1, X_INT);
input->SetData(inputBatch, count * 4);
gold->SetData(goldBatch, count);
}
delete[] line;
delete[] inputBatch;
delete[] goldBatch;
if (count > 0)
return true;
else
return false;
}
/*****************************
* the neural model
******************************/
/* constructor */
TTModel::TTModel()
{
devID = -1;
vSize = 0;
eSize = 0;
hSize = 0;
}
/* de-constructor */
TTModel::~TTModel()
{
}
/* config it */
void TTModel::SetConfig(XConfig &myConfig)
{
config.CreateFromMe(myConfig);
}
/*
initialize the model
>> myConfig - configuration
>> devID - device id
*/
void TTModel::Init(XConfig &myConfig, int myDevID)
{
Clear();
SetConfig(myConfig);
devID = myDevID;
vSize = MAX_INT_IN_TTRAIN + 1;
eSize = config.GetInt("esize", TT_EMBEDDING_SIZE);
hSize = config.GetInt("hsize", TT_HIDDEN_SIZE);
InitTensor2D(&embeddingW, vSize, eSize, X_FLOAT, devID);
InitTensor2D(&hiddenW, MAX_SAMPLE_SIZE * eSize, hSize, X_FLOAT, devID);
InitTensor2D(&outputW, hSize, vSize, X_FLOAT, devID);
embeddingW.SetName("embeddingw");
hiddenW.SetName("hiddenw");
outputW.SetName("outputw");
embeddingW.SetDataRand(-0.1F, 0.1F);
hiddenW.SetDataRand(-0.1F, 0.1F);
outputW.SetDataRand(-0.1F, 0.1F);
AddParam(&embeddingW);
AddParam(&hiddenW);
AddParam(&outputW);
}
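With the default sizes above (vSize = MAX_INT_IN_TTRAIN + 1 = 101, eSize = TT_EMBEDDING_SIZE = 128, hSize = TT_HIDDEN_SIZE = 512), the three parameter matrices come out as embeddingW: 101 x 128, hiddenW: (4 * 128) x 512 = 512 x 512, and outputW: 512 x 101.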
/*
create the neural network and run it (forward pass)
>> devID - device id
>> input - as it is
>> output - as it is
*/
void TTModel::Forward(int devID, XTensor * input, XTensor * output)
{
XTensor embedding;
XTensor embeddingCat;
XTensor hidden;
/* [e_0, e_1, e_2, e_3] = w_e * input(one-hot) */
embedding = Gather(embeddingW, *input);
/* e = merge(e_0, e_1, e_2, e_3) */
embeddingCat = Merge(embedding, embedding.order - 1, embedding.order - 2);
/* h = hardtanh(e * w_h) */
hidden = HardTanH(MMul(embeddingCat, hiddenW));
/* output = Softmax(h * w_o) */
*output = Softmax(MMul(hidden, outputW), -1);
}
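In equation form, Forward computes (a restatement of the comments above, where $W_h$ is hiddenW, $W_o$ is outputW, and $e_i$ is the embedding row gathered for the $i$-th input integer):

$$\mathbf{e} = [\,e_0; e_1; e_2; e_3\,], \qquad \mathbf{h} = \mathrm{hardtanh}(\mathbf{e}\,W_h), \qquad \mathbf{y} = \mathrm{softmax}(\mathbf{h}\,W_o)$$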
/* clear the model */
void TTModel::Clear()
{
config.Clear();
}
/*
clone the model
>> devID - device id
*/
XModel * TTModel::Clone(int devID)
{
TTModel * model = new TTModel();
model->SetConfig(config);
model->Init(config, devID);
CopyValues(embeddingW, model->embeddingW);
CopyValues(hiddenW, model->hiddenW);
CopyValues(outputW, model->outputW);
return model;
}
/*
run the neural network
>> inputs - inputs of the model
>> outputs - outputs of the model
>> golds - gold standards
>> losses - losses of the output with respect to the gold standards
*/
bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList* losses)
{
//fprintf(stderr, "run simple 0\n");
CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong arguments!");
CheckNTErrors(outputs != NULL && outputs->count >= 1, "Wrong arguments!");
CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong arguments!");
CheckNTErrors(losses != NULL && losses->count >= 1, "Wrong arguments!");
XTensor * input = (XTensor*)inputs->GetItem(0);
XTensor * output = (XTensor*)outputs->GetItem(0);
XTensor * gold = (XTensor*)golds->GetItem(0);
XTensor * loss = (XTensor*)losses->GetItem(0);
XTensor goldOneHot;
/* place all input data on the correct device */
input->FlushToDevice(devID);
output->FlushToDevice(devID);
gold->FlushToDevice(devID);
XNet net;
/* create the neural network and run it */
Forward(devID, input, output);
/* gold standard in one-hot representation */
goldOneHot = IndexToOnehot(*gold, vSize, 0.0F);
int * dims = new int[goldOneHot.order];
for (int i = 0; i < goldOneHot.order - 2; i++)
dims[i] = goldOneHot.GetDim(i);
dims[goldOneHot.order - 2] = goldOneHot.GetDim(goldOneHot.order - 1);
goldOneHot.Reshape(goldOneHot.order - 1, dims);
/* loss */
*loss = CrossEntropy(*output, goldOneHot);
/* back-propagation */
net.Backward(*loss);
delete[] dims;
//fprintf(stderr, "run simple 1\n");
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We test XTrain here. It is simple: we design a toy task in which we
* make the model predict an integer E (0-100) from four input integers
* A, B, C and D (0-100). We generate a number of samples with different values
* of A, B, C and D. The gold standard is
*
* E = (int)(sqrt(A * B) + abs(C - D))/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
* The express train was updated this year. It takes me just two and
* a half hours from Shenyang to Beijing.
*/
#ifndef __TTRAIN_H__
#define __TTRAIN_H__
#include <stdio.h>
#include <stdlib.h>
#include "XTrainer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_SAMPLE_NUM_IN_TTRAIN 200000
#define MAX_INT_IN_TTRAIN 100
#define MAX_SAMPLE_LINE_LENGTH 128
#define MAX_SAMPLE_SIZE 4
#define TT_BATCH_SIZE 256
#define TT_EMBEDDING_SIZE 128
#define TT_HIDDEN_SIZE 512
extern XTensor * tmpTT;
/* generate the training data file */
void GeneateTTrainData(const char * fileName);
/* run the test */
extern
void TestTrain();
/* data loader */
class TTDataLoader : public DataDistributeBase
{
protected:
/* file name */
char * fileName;
/* file handle */
FILE * file;
/* batch size */
int batchSize;
public:
/* constructor */
TTDataLoader();
/* de-constructor */
~TTDataLoader();
/* set file name */
void SetFileName(const char * myFileName);
/* set batch size */
void SetBatchSize(int myBatchSize);
/* start the process */
bool Start();
/* end the process */
bool End();
/* get a batch of samples */
bool GetBatchSimple(XList * inputs, XList * golds);
};
/* the model */
class TTModel : public XModel
{
protected:
/* device id */
int devID;
/* configuration */
XConfig config;
/* embedding matrix of the input */
XTensor embeddingW;
/* parameter matrix of the hidden layer */
XTensor hiddenW;
/* parameter matrix of the output layer */
XTensor outputW;
/* vocabulary size */
int vSize;
/* embedding size */
int eSize;
/* hidden layer size */
int hSize;
public:
/* constructor */
TTModel();
/* de-constructor */
~TTModel();
/* config it */
void SetConfig(XConfig &myConfig);
/* initialize the parameters */
void Init(XConfig &myConfig, int myDevID);
/* create the neural network and run it (forward pass) */
void Forward(int devID, XTensor * input, XTensor * output);
/* clear the model */
void Clear();
/* clone the model */
XModel * Clone(int devID);
/* run the neural network */
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
*/
#include "XBaseTemplate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*******************************
* data loader template
*******************************/
/* constructor */
DataDistributeBase::DataDistributeBase()
{
MUTEX_INIT(loadMutex);
}
/* de-constructor */
DataDistributeBase::~DataDistributeBase()
{
MUTEX_DELE(loadMutex);
}
/* start the job (e.g., open the file) */
bool DataDistributeBase::Start()
{
ShowNTErrors("DataDistributeBase::Start must be overloaded!");
return true;
}
/* end the job (e.g., close the file) */
bool DataDistributeBase::End()
{
ShowNTErrors("DataDistributeBase::End must be overloaded!");
return true;
}
/*
get a batch of samples
>> inputs - inputs of the model
>> golds - gold standards
*/
bool DataDistributeBase::GetBatchSimple(XList * inputs, XList * golds)
{
return false;
}
/* get a batch of samples */
bool DataDistributeBase::GetBatch(XList * args)
{
CheckNTErrors(args->count >= 2, "More input arguments are required!");
XList * input = (XList*)args->GetItem(0);
XList * gold = (XList*)args->GetItem(1);
if (GetBatchSimple(input, gold))
return true;
ShowNTErrors("You must be overload one of these: DataDistributeBase::GetBatchSimple ... !");
return false;
}
/* get a batch of samples (for multi-threading) */
bool DataDistributeBase::GetBatchSafe(XList * args)
{
bool r;
MUTEX_LOCK(loadMutex);
r = GetBatch(args);
MUTEX_UNLOCK(loadMutex);
return r;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
* The meeting at 3:00pm today was canceled. More time for coding.
*/
#ifndef __XNETTEMPLATE_H__
#define __XNETTEMPLATE_H__
#include "../tensor/XTensor.h"
#include "../tensor/XThread.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
data distributor template. It distributes batches of data to workers.
The use of data distributor follows:
Start() -> GetBatch() -> ... -> GetBatch() -> End()
In addition, GetBatch() should be thread-safe, and thus could be
called by different threads simultaneously.
*/
class DataDistributeBase
{
protected:
/* mutex of batch loading */
MUTEX_HANDLE loadMutex;
public:
/* constructor */
DataDistributeBase();
/* de-constructor */
~DataDistributeBase();
/* start the job (e.g., open the file).
NOTE THAT before calling Start() one should initialize
the distributor if necessary */
virtual
bool Start();
/* end the job (e.g., close the file) */
virtual
bool End();
/* get a batch of samples */
virtual
bool GetBatchSimple(XList * inputs, XList * golds);
public:
/* get a batch of samples */
bool GetBatch(XList * args);
/* get a batch of samples (for multi-threading) */
bool GetBatchSafe(XList * args);
};
}
#endif // __XNETTEMPLATE_H__
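A minimal usage sketch of the Start() -> GetBatch() -> ... -> End() protocol described above, assuming a hypothetical subclass MyLoader that overloads GetBatchSimple() (TTDataLoader in TTrain.cpp has exactly this shape):

MyLoader loader;               /* a DataDistributeBase subclass (hypothetical) */
XTensor input, gold;           /* tensors the loader will fill per batch */
XList inputs, golds;
inputs.Add(&input);            /* GetBatchSimple() expects at least one item */
golds.Add(&gold);

loader.Start();                /* e.g., open the file */
while (loader.GetBatchSimple(&inputs, &golds)) {
    /* hand the batch to a worker here */
}
loader.End();                  /* e.g., close the file */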
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A "leader" manages a number of "workers". The leader recieves jobs from
* the central server (can be remote), or acts as an independent server itself.
* For workers, the leader is the one who issues orders and organizes them.
* Note that the leader and workers must be on the same machine. In case of
* multi-machine training, one can deploy different leaders on different
* machines. BUT, at this time, we need an additional way of distributing
* data across machines.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
*/
#include "XLeader.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XLeader::XLeader()
{
id = -1;
}
/* de-constructor */
XLeader::~XLeader()
{
}
/* initialize the leader */
void XLeader::Init()
{
for (int i = 0; i < jworkers.count; i++)
delete (XWorkerJob*)jworkers.GetItem(i);
jworkers.Clear();
for (int i = 0; i < cworkers.count; i++)
delete (XWorkerCollect*)cworkers.GetItem(i);
cworkers.Clear();
for (int i = 0; i < uworkers.count; i++)
delete (XWorkerUpdate*)uworkers.GetItem(i);
uworkers.Clear();
for (int i = 0; i < bworkers.count; i++)
delete (XWorkerBroadcast*)bworkers.GetItem(i);
bworkers.Clear();
serverRecord.Clear();
}
/* set id */
void XLeader::SetID(int myID)
{
id = myID;
}
/* get id */
int XLeader::GetID()
{
return id;
}
/*
Set the server model. It distributes the server-side parameters on different devices.
>> config - the configuration
>> model - the base model
>> memberModels - the models that run on different devices. We can place
the server-side parameters on different member models.
*/
void XLeader::SetServerModel(XConfig * config, XModel * model, XList * memberModels)
{
serverModel.Clear();
for (int i = 0; i < model->paramNum; i++) {
XTensor * param = model->params[i].param;
serverModel.AddParam(param);
}
/* TODO: we can place parameters on different devices */
}
/*
set the server model. It distributes the server-side parameters on different devices.
>> config - the configuration
>> model - the base model
*/
void XLeader::SetServerModel(XConfig * config, XModel * model)
{
XList members;
for (int i = 0; i < jworkers.count; i++) {
XModel * member = ((XWorkerJob*)jworkers[i])->GetModel();
members.Add(member);
}
SetServerModel(config, model, &members);
}
/* initialize the models for running them */
void XLeader::InitForRun()
{
serverModel.InitForRun();
for (int i = 0; i < jworkers.count; i++) {
XModel* model = ((XWorkerJob*)jworkers[i])->GetModel();
model->InitForRun();
}
XList workers;
workers.AddList(&jworkers);
workers.AddList(&cworkers);
workers.AddList(&uworkers);
workers.AddList(&bworkers);
for (int i = 0; i < workers.count; i++) {
XWorker* worker = (XWorker*)workers[i];
CheckNTErrors(worker->IsEmpty(), "Something is wrong with the finishedQueue!");
}
}
/* set grad = 0 */
void XLeader::ResetParamGrad()
{
for (int i = 0; i < serverModel.paramNum; i++) {
XTensor* param = serverModel.params[i].param;
if (param->grad != NULL) {
param->grad->SetZeroAll();
}
}
for (int j = 0; j < jworkers.count; j++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[j];
XModel * model = worker->GetModel();
for (int i = 0; i < model->paramNum; i++) {
XTensor* param = model->params[i].param;
if (param->grad != NULL) {
param->grad->SetZeroAll();
}
}
}
}
/*
wait for finished states (i.e., all workers finish their jobs)
>> activeJobWorkers - indicates whether each job worker is active
>> isToUpdate - indicates whether the model is updated
*/
void XLeader::WaitForFinishing(const int* activeJobWorkers, const int isToUpdate)
{
int activeCount = 0;
for (int i = 0; i < jworkers.count; i++) {
if (activeJobWorkers[i] > 0) {
XWorker* worker = (XWorker*)jworkers[i];
worker->DequeueFinishedJob();
activeCount++;
}
}
if (activeCount > 0 && isToUpdate) {
for (int i = 0; i < cworkers.count; i++) {
XWorker* worker = (XWorker*)cworkers[i];
worker->DequeueFinishedJob();
}
for (int i = 0; i < uworkers.count; i++) {
XWorker* worker = (XWorker*)uworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
}
for (int i = 0; i < bworkers.count; i++) {
XWorker* worker = (XWorker*)bworkers[i];
for (int j = 0; j < serverModel.paramNum; j++)
worker->DequeueFinishedJob();
}
}
}
/* get loss */
float XLeader::GetLoss()
{
return serverRecord.lossAll;
}
/* get sample number */
int XLeader::GetSampleNum()
{
return serverRecord.sampleNum;
}
/* get prediction number */
int XLeader::GetPredictNum()
{
return serverRecord.predictNum;
}
/*
set the communication mode
>> myMode - the mode
*/
void XLeader::SetMode(XLEADER_MODE myMode)
{
mode = myMode;
}
/* set the flag of instant run */
void XLeader::SetInstantRun(bool flag)
{
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)cworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)uworkers.GetItem(i);
worker->SetInstantRun(flag);
}
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)bworkers.GetItem(i);
worker->SetInstantRun(flag);
}
}
/* start the workers */
void XLeader::Start()
{
serverModel.CheckParam();
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers.GetItem(i);
worker->GetModel()->CheckParam();
worker->Start();
}
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)cworkers.GetItem(i);
worker->Start();
}
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)uworkers.GetItem(i);
worker->Start();
}
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)bworkers.GetItem(i);
worker->Start();
}
}
/*
add a number of job workers (given their device ids)
>> model - the neural network
>> n - number of the models
>> ids - the array of device ids
*/
void XLeader::AddJobWorker(XModel * model, int n, int * ids)
{
/* we keep the input model */
if (n >= 1) {
XWorkerJob * worker = new XWorkerJob();
worker->SetModel(model);
jworkers.Add(worker);
}
/* we clone the input model */
for (int i = 1; i < n; i++) {
XWorkerJob * worker = new XWorkerJob();
worker->SetModel(model->Clone(ids[i]));
jworkers.Add(worker);
}
}
/*
add a data-collecting worker
>> mode - the data-transfer mode of the worker
*/
void XLeader::AddJobCollectWorker(DATA_COLLECT_TYPE mode)
{
XWorkerCollect * worker = new XWorkerCollect();
worker->SetCollectMode(mode);
cworkers.Add(worker);
}
/*
add a model-update worker
>> model - the model
>> optimizer - the optimizer
*/
void XLeader::AddJobUpdateWorker(XModel * model, XOptimizer * optimizer)
{
XWorkerUpdate * worker = new XWorkerUpdate();
worker->SetOptimizer(optimizer);
uworkers.Add(worker);
}
/* add a data-broadcasting worker */
void XLeader::AddJobBroadcastWorker()
{
XWorkerBroadcast * worker = new XWorkerBroadcast();
bworkers.Add(worker);
}
/*
run the model (for one time). Basically this is a map-reduce process.
>> config - the configuration
>> dataDistributor - data distributor
>> model - the neural network that we want to run
>> optimizer - the optimization method
<< return - if we can fetch the new data
*/
bool XLeader::Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(jworkers.count > 0, "No jworkers!");
CheckNTErrors(cworkers.count > 0, "No cworkers!");
CheckNTErrors(uworkers.count > 0, "No uworkers!");
CheckNTErrors(bworkers.count > 0, "No bworkers!");
bool isDataOK = true;
bool isToUpdate = (optimizer != NULL);
int activeJobCount = 0;
int* active = new int[jworkers.count];
InitForRun();
for (int i = 0; i < jworkers.count; i++)
active[i] = 0;
/* Feed the input to each worker and generate the output.
For each worker, we define a job queue and enqueue jobs
into it.
*/
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
XModel * jmodel = worker->GetModel();
/* get a batch of samples */
bool fetched = dataDistributor->GetBatchSimple(worker->GetInput(), worker->GetGold());
if (!fetched)
isDataOK = false;
else {
/* job in queue 1: refresh the model */
worker->AddJobRefresh(jmodel);
/* job in queue 1: run the model */
worker->AddJobNeuralNet(jmodel,
worker->GetInput(), worker->GetOutput(),
worker->GetGold(), worker->GetLoss());
/* job in queue 1: make a record of the run */
worker->AddJobRecord(&serverRecord);
/* job in queue 1: mark finished */
worker->AddJobEnqueueFinished();
active[i] = 1;
activeJobCount++;
}
}
if (activeJobCount > 0 && isToUpdate) {
/* workers */
XWorkerCollect * collecter = (XWorkerCollect*)cworkers.GetItem(0);
XWorkerUpdate * updater = (XWorkerUpdate*)uworkers.GetItem(0);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)bworkers.GetItem(0);
/* member models that are active in this run */
XList members(jworkers.count);
/* all member models */
XList membersAll(jworkers.count);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)jworkers[i];
membersAll.Add(worker->GetModel());
if (active[i] == 1)
members.Add(worker->GetModel());
}
/* jobs in queue 2: collect the (gradient) data and other stuff. This
is a reduce process. The collector will add a job in queue 3
to update the model. The updater will add a job in queue 4 to
broadcast the latest parameters to workers. NOTE that we would update
a worker to the latest model parameters, even if it is not involved
in this run. */
collecter->AddJobUpdateAll(&members, &membersAll, &serverModel,
optimizer, updater, broadcaster);
collecter->AddJobEnqueueFinished();
}
WaitForFinishing(active, isToUpdate);
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob * worker = (XWorkerJob*)jworkers[i];
worker->Clear();
}
delete[] active;
return isDataOK;
}
/* wait until all workers finish their job */
void XLeader::WaitForFinishing(int sleepTime)
{
while (1) {
bool finished = true;
if (finished) {
for (int i = 0; i < jworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)jworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < cworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)cworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < uworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)uworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished) {
for (int i = 0; i < bworkers.count; i++) {
XWorkerJob* worker = (XWorkerJob*)bworkers[i];
if (worker->GetJobNum() > 0) {
finished = false;
break;
}
}
}
if (finished)
break;
XSleep(sleepTime);
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A "leader" manages a number of "workers". The leader recieves jobs from
* the central server (can be remote), or acts as an independent server itself.
* For workers, the leader is the one who issues orders and organizes them.
* Note that the leader and workers must be on the same machine. In case of
* multi-machine training, one can deploy different leaders on different
* machines. BUT, at this time, we need an additional way of distributing
* data across machines.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
* We will go on a business trip. The first trip after the Spring Festival.
*/
#ifndef __XLEADER_H__
#define __XLEADER_H__
#include "XModel.h"
#include "XOptimizer.h"
#include "XBaseTemplate.h"
#include "XWorkerJob.h"
#include "XWorkerCollect.h"
#include "XWorkerUpdate.h"
#include "XWorkerBroadcast.h"
#include "./optimizer/OHeader.h"
#include "../tensor/XConfig.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_NUM_OF_WORKERS 1024
#define SLEEP_TIME_IN_WAITING_FOR_JOBS 20
/*
communication mode of a leader. This offers a way of organizing a hierarchy of the work:
1) run as a standalone program
2) give orders to another leader (probably remote)
3) receive orders from another leader (probably remote)
4) give (and receive) orders to (and from) different leaders
*/
enum XLEADER_MODE { XLEADER_STANDALONE, XLEADER_SEND, XLEADER_RECIEVE, XLEADER_SEND_AND_RECIEVE };
/* a leader who manages workers */
class XLeader
{
protected:
/* id of the leader */
int id;
/* a model that keeps the parameters (as a server) */
XModel serverModel;
/* a record that keeps the information of the run */
XNNRecord serverRecord;
/* communication mode */
XLEADER_MODE mode;
/* job workers */
XList jworkers;
/* data-collecting workers */
XList cworkers;
/* model-update workers */
XList uworkers;
/* data-broadcasting workers */
XList bworkers;
public:
/* constructor */
XLeader();
/* de-constructor */
~XLeader();
/* initialize the leader */
void Init();
/* set id */
void SetID(int myID);
/* get id */
int GetID();
/* set the server model */
void SetServerModel(XConfig * config, XModel * model, XList * memberModels);
/* set the server model */
void SetServerModel(XConfig * config, XModel * model);
/* initialize the models for running them */
void InitForRun();
/* set grad = 0 */
void ResetParamGrad();
/* wait for finished states (i.e., all workers finish their jobs) */
void WaitForFinishing(const int * activeJobWorkers, const int isToUpdate);
/* get loss */
float GetLoss();
/* get sample number */
int GetSampleNum();
/* get prediction number */
int GetPredictNum();
/* start the workers */
void Start();
/* set the communication mode */
void SetMode(XLEADER_MODE myMode);
/* set the flag of instant run */
void SetInstantRun(bool flag = true);
/* add a number of job workers (given their device ids) */
void AddJobWorker(XModel * model, int n, int * ids);
/* add a data-collecting worker */
void AddJobCollectWorker(DATA_COLLECT_TYPE mode = DATA_COLLECT_P2P);
/* add a model-update worker */
void AddJobUpdateWorker(XModel * model, XOptimizer * optimizer);
/* add a data-broadcasting worker */
void AddJobBroadcastWorker();
/* run the model (for one time) */
bool Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer);
/* wait until all workers finish their job */
void WaitForFinishing(int sleepTime = SLEEP_TIME_IN_WAITING_FOR_JOBS);
};
}
#endif // __XLEADER_H__
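A hedged sketch of how a trainer might drive a leader, using only the methods declared above (the exact wiring lives in XTrainer, which is not part of this section, so the ordering here is an assumption):

XLeader leader;
leader.Init();
leader.AddJobWorker(model, n, devIDs);        /* n replicas of the model */
leader.AddJobCollectWorker();                 /* gradient reduce */
leader.AddJobUpdateWorker(model, optimizer);  /* parameter update */
leader.AddJobBroadcastWorker();               /* send new parameters back */
leader.SetServerModel(config, model);         /* after the job workers exist */
leader.Start();                               /* check params and start the workers */
while (leader.Run(config, &loader, model, optimizer)) {
    /* one mini-batch per call; returns false when the data runs out */
}
leader.WaitForFinishing();                    /* drain the remaining jobs */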
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-16
* I wore my coat again after the rain yesterday.
*/
#include "XLearningRate.h"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XLearningRate::XLearningRate()
{
}
/* de-constructor */
XLearningRate::~XLearningRate()
{
}
/* a Transformer-style scheduler. For more details, see
"Attention is all need" by Vaswani at al.
>> lrate - the learning rate
>> nstep - the update step number
>> nwarmup - the warmup step number
*/
float XLearningRate::MakeLRTransformer(const float lrate, const int nstep, const int nwarmup)
{
float lr = 0;
float warmupEndLR = lrate;
float warmupInitLR = 1e-7F;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * (float)pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (nstep < nwarmup)
lr = warmupInitLR + nstep * lrStep;
else
lr = decayFactor * (float)pow((float)nstep, -0.5F);
return lr;
}
}
\ No newline at end of file
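Written out, MakeLRTransformer implements the inverse-square-root schedule with linear warmup (here $s$ = nstep, $n$ = nwarmup, $\eta$ = lrate, and $\eta_0$ = warmupInitLR = $10^{-7}$):

$$\mathrm{lr}(s) = \begin{cases} \eta_0 + s \cdot \dfrac{\eta - \eta_0}{n} & s < n \\[4pt] \eta \cdot \sqrt{n/s} & s \ge n \end{cases}$$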
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is a learning rate generator. E.g., one can adjust the learning rate as
* the training process proceeds.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-16
* I wore my coat again after the rain yesterday.
*/
#ifndef __XLEARNINGRATE_H__
#define __XLEARNINGRATE_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* Learning rate scheduler */
class XLearningRate
{
public:
/* constructor */
XLearningRate();
/* de-constructor */
~XLearningRate();
/* a Transformer-style scheduler */
float MakeLRTransformer(const float lrate, const int nstep, const int nwarmup);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class maintains the parameters (and other stuff) for training. It
* could be used to manage the parameter copy and update in training. E.g.,
* one can use this class to keep the parameters on the server side, or
* treat it as an individual model on the worker side.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XModel.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XParamKeeper::XParamKeeper()
{
param = NULL;
flag = PARAM_STATE_NOT_READY;
trainFlag = PARAM_STATE_NOT_READY;
MUTEX_INIT(accessLock);
MUTEX_INIT(trainLock);
}
/* de-constructor */
XParamKeeper::~XParamKeeper()
{
MUTEX_DELE(accessLock);
MUTEX_DELE(trainLock);
}
/* constructor */
XModel::XModel()
{
params = NULL;
paramNum = 0;
MUTEX_INIT(modelMutex);
}
/* de-constructor */
XModel::~XModel()
{
Clear();
MUTEX_DELE(modelMutex);
}
/* clear the model */
void XModel::Clear()
{
delete[] params;
params = NULL;
paramNum = 0;
}
/*
clone the model (would be overloaded)
>> devID - the device on that we keep the model
<< return - a cloned model
*/
XModel * XModel::Clone(int devID)
{
ShowNTErrors("XModel::Clone() should be overloaded!");
return NULL;
}
/*
run the neural network
>> inputs - inputs of the model
>> outputs - outputs of the model
>> golds - gold standards
>> losses - losses of the input with respect to the gold standards
*/
bool XModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses)
{
return false;
}
/*
run the neural network
>> args - the arguments
*/
bool XModel::RunMe(XList * args)
{
CheckNTErrors(args->count >= 4, "More arguments are required!");
XList * inputs = (XList*)args->GetItem(0);
XList * outputs = (XList*)args->GetItem(1);
XList * golds = (XList*)args->GetItem(2);
XList * losses = (XList*)args->GetItem(3);
if (RunSimple(inputs, outputs, golds, losses))
return true;
ShowNTErrors("You must be overload one of these: XModel::RunSimple ... !");
return false;
}
/*
add a parameter tensor
>> param - the parameter tensor to add
*/
void XModel::AddParam(XTensor* param)
{
param->SetVarFlag();
XParamKeeper * newParams = new XParamKeeper[paramNum + 1];
for (int i = 0; i < paramNum; i++) {
newParams[i].param = params[i].param;
newParams[i].flag = params[i].flag;
}
newParams[paramNum].param = param;
newParams[paramNum].flag = PARAM_STATE_NOT_READY;
delete[] params;
params = newParams;
paramNum++;
}
/* check if the parameters are well-defined for training */
bool XModel::CheckParam()
{
for (int i = 0; i < paramNum; i++) {
XTensor * param = params[i].param;
if (!param->isGrad)
return false;
}
return true;
}
/* initialize the model for running it */
void XModel::InitForRun()
{
RefreshMe();
}
/* lock the parameter states (wait for unlocking them when
a run of training is finished) */
void XModel::LockParamsForTraining()
{
for (int i = 0; i < paramNum; i++) {
params[i].trainFlag = PARAM_STATE_NOT_READY;
MUTEX_LOCK(params[i].trainLock);
/* where is UNLOCK? We will do this when the training (a step)
is finished. Then, WaitForUnlockedParams() can continue. In
such a way, we implement a START-WAIT process in each run
of training (a step) */
}
}
/* unlock the parameter states */
void XModel::WaitForUnlockedParams()
{
for (int i = 0; i < paramNum; i++) {
/* the lock proceeds only when the trainLock is unlocked
in training. In this way, we are actually waiting for
the FINISHED signal from other workers/threads. */
MUTEX_LOCK(params[i].trainLock);
CheckNTErrors(params[i].trainFlag == PARAM_STATE_UPDATED,
"the state of the parameter is wrong!");
MUTEX_UNLOCK(params[i].trainLock);
}
}
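/* A sketch of the START-WAIT protocol built from the two methods above.
The unlocking side lives elsewhere (in the training code), which is assumed
to set trainFlag and release trainLock once a step is done:

server thread                          training worker
-------------                          ---------------
LockParamsForTraining();               ... compute and update ...
WaitForUnlockedParams();  <- blocks    params[i].trainFlag = PARAM_STATE_UPDATED;
                                       MUTEX_UNLOCK(params[i].trainLock);
(resumes once every parameter's trainLock has been released)
*/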
/* refresh the model */
void XModel::RefreshMe()
{
for (int i = 0; i < paramNum; i++) {
params[i].param->isGradFinished = false;
params[i].flag = PARAM_STATE_NOT_READY;
params[i].trainFlag = PARAM_STATE_NOT_READY;
}
}
/* wrapper of RefreshMe */
void XModel::Refresh(XList * args)
{
CheckNTErrors(args != NULL && args->count > 0, "no arguments for XModel::Refresh!");
XModel * model = (XModel*)args->GetItem(0);
model->RefreshMe();
}
/* wrapper of Run() */
bool XModel::Run(XList * args)
{
CheckNTErrors(args != NULL && args->count > 0, "no arguments for XModel::Run!");
XModel * model = (XModel*)args->GetItem(0);
XList newArgs;
for (int i = 1; i < args->count; i++) {
void * arg = args->GetItem(i);
newArgs.Add(arg);
}
return model->RunMe(&newArgs);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class maintains the parameters (and other stuff) for training. It
* could be used to manage the parameter copy and update in training. E.g.,
* one can use this class to keep the parameters on the server side, or
* treat it as an individual model on the worker side.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* I created more than one file today, hahaha
*/
#ifndef __XMODEL_H__
#define __XMODEL_H__
#include "../network/XNet.h"
#include "../tensor/XQueue.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
parameter state
1) not ready
2) ready
3) the parameter has been collected from other models
4) the updated parameter
*/
enum PARAM_STATE { PARAM_STATE_NOT_READY,
PARAM_STATE_READY,
PARAM_STATE_COLLECTED,
PARAM_STATE_UPDATED };
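/* A typical life cycle of a parameter within one training step (as driven
by XWorkerCollect and XWorkerUpdate): PARAM_STATE_NOT_READY while gradients
are still being computed, PARAM_STATE_COLLECTED once the gradients of all
active members have been summed on the server, and PARAM_STATE_UPDATED
after the optimizer has applied the update. XModel::RefreshMe() resets the
flag to PARAM_STATE_NOT_READY for the next step. */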
/* parameter keeper */
class XParamKeeper
{
public:
/* the parameter */
XTensor * param;
/* the parameter state */
PARAM_STATE flag;
/* the state of the entire training process
(choosing from PARAM_STATE_NOT_READY and
PARAM_STATE_UPDATED) */
PARAM_STATE trainFlag;
/* a mutex for locking and unlocking the parameter */
MUTEX_HANDLE accessLock;
/* a mutex of the overall training */
MUTEX_HANDLE trainLock;
public:
/* constructor */
XParamKeeper();
/* de-constructor */
~XParamKeeper();
};
/* a model template for training */
class XModel
{
protected:
/* mutex of the model */
MUTEX_HANDLE modelMutex;
public:
/* the list of model parameters */
XParamKeeper * params;
/* parameter number */
int paramNum;
public:
/* constructor */
XModel();
/* de-constructor */
~XModel();
/* clear the model (would be overloaded) */
virtual
void Clear();
/* clone the model (would be overloaded) */
virtual
XModel * Clone(int devID);
/* run the neural network */
virtual
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
protected:
/* run the neural network */
bool RunMe(XList * args);
public:
/* add a parameter tensor */
void AddParam(XTensor * param);
/* check if the parameters are well-defined for training */
bool CheckParam();
/* lock the parameter states (wait for unlocking them when
a run of training is finished) */
void LockParamsForTraining();
/* wait for the parameter states to be unlocked */
void WaitForUnlockedParams();
/* initialize the model for running it */
void InitForRun();
/* refresh the model */
void RefreshMe();
/* wrapper of RefreshMe() */
static
void Refresh(XList * args);
/* wrapper of Run() */
static
bool Run(XList * args);
};
}
#endif // __XMODEL_H__
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A record that keeps some information in running and training neural networks
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-06
* I will climb mountains with my wife and son this afternoon, hahaha :)
*/
#include "XNNRecord.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XNNRecord::XNNRecord()
{
Clear();
MUTEX_INIT(mutex);
}
/* de-constructor */
XNNRecord::~XNNRecord()
{
MUTEX_DELE(mutex);
}
/* clear it */
void XNNRecord::Clear()
{
lossAll = 0;
sampleNum = 0;
predictNum = 0;
state = XWORKER_UNSTARTED;
}
/* update me with another record */
void XNNRecord::Update(XNNRecord & record)
{
lossAll += record.lossAll;
sampleNum += record.sampleNum;
predictNum += record.predictNum;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* A record that keeps some information in running and training neural networks
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-06
* I will climb mountains with my wife and son this afternoon, hahaha :)
*/
#ifndef __XNNRECORD_H__
#define __XNNRECORD_H__
#include "XWorker.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* a record that keeps some information during training */
class XNNRecord
{
public:
/* loss over all samples */
float lossAll;
/* sample number */
int sampleNum;
/* prediction number */
int predictNum;
/* state */
XWORKER_STATE state;
/* mutex */
MUTEX_HANDLE mutex;
public:
/* constructor */
XNNRecord();
/* de-constructor */
~XNNRecord();
/* clear it */
void Clear();
/* update me with another record */
void Update(XNNRecord & record);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class defines the template of the update rule in gradient-based methods
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XOptimizer.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XOptimizer::XOptimizer()
{
Clear();
}
/* de-constructor */
XOptimizer::~XOptimizer()
{
Clear();
}
/*
initialize the optimizer
>> config - the configuration
*/
void XOptimizer::Init(XConfig &config)
{
nstep = config.GetInt("nstep", 100000);
nepoch = config.GetInt("nepoch", 50);
lrate = config.GetFloat("lrate", 0.1F);
}
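/* Example (a sketch): if the configuration carries "nstep 50000",
"nepoch 20" and "lrate 0.0005", Init() sets nstep = 50000, nepoch = 20 and
lrate = 0.0005F; keys that are absent fall back to the defaults above
(100000, 50 and 0.1F). The exact key syntax depends on how the XConfig
instance was populated. */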
/* clear the optimizer */
void XOptimizer::Clear()
{
nstep = 0;
nepoch = 0;
lrate = 0;
}
/* reset the optimizer (re-start) */
void XOptimizer::Reset()
{
}
void XOptimizer::ShowSettings()
{
XPRINT(1, stderr, "[INFO] Optimizer Setup:\n");
XPRINT2(1, stderr, "%25s = %d\n", "nstep", nstep);
XPRINT2(1, stderr, "%25s = %d\n", "nepoch", nepoch);
XPRINT2(1, stderr, "%25s = %.3f\n", "lrate", lrate);
}
/*
record the update
>> model - the model that we want to update
*/
void XOptimizer::Note(XModel * model)
{
nstep++;
}
/*
update a parameter matrix
>> param - the parameter matrix
>> grad - the gradient
>> pid - the id of the parameter matrix
*/
void XOptimizer::UpdateParam(XTensor * param, XTensor * grad, int pid)
{
/* the delta rule
\theta_new = \theta_old - \grad * \lrate */
_Sum(param, grad, param, -lrate);
}
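/* A worked instance of the delta rule above: with lrate = 0.1, a parameter
entry of 0.5 and a gradient entry of 0.2, the call
_Sum(param, grad, param, -0.1F) computes 0.5 + 0.2 * (-0.1) = 0.48, i.e.,
param = param + grad * beta with beta = -lrate, which is exactly
\theta_new = \theta_old - lrate * grad. */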
/* get learning rate */
float XOptimizer::GetLearningRate()
{
return lrate;
}
/*
set learning rate
>> myLRate - the learning rate that we want to use
*/
void XOptimizer::SetLearningRate(float myLRate)
{
lrate = myLRate;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This class defines the template of the update rule in gradient-based methods
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
* March came finally but there was a snow last night.
*/
#ifndef __XOPTIMIZER_H__
#define __XOPTIMIZER_H__
#include "XModel.h"
#include "../tensor/XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* this class defines a template of the optimizer and
implement the simple delta-rule in SGD. */
class XOptimizer
{
public:
/* update step number */
int nstep;
/* training epoch number */
int nepoch;
/* learning rate */
float lrate;
public:
/* constructor */
XOptimizer();
/* de-constructor */
~XOptimizer();
/* initialize the optimizer */
virtual
void Init(XConfig &config);
/* clear the optimizer */
virtual
void Clear();
/* reset the optimizer (re-start) */
virtual
void Reset();
/* show settings */
virtual
void ShowSettings();
/* record the update */
virtual
void Note(XModel * model);
/* update a parameter matrix */
virtual
void UpdateParam(XTensor * param, XTensor * grad, int pid);
/* get learning rate */
float GetLearningRate();
/* set learning rate */
void SetLearningRate(float myLRate);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
*
*/
#include "XTrainer.h"
#include "XLearningRate.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XTrainer::XTrainer()
{
}
/* de-constructor */
XTrainer::~XTrainer()
{
}
/*
get the device ids of the jobs
>> config - configuration
>> ids - the array of device ids
>> num - number of the jobs
>> maxDevNum - the maximum number of devices
*/
void XTrainer::GetDevIDs(XConfig * config, int * ids, int & num, int maxDevNum)
{
CheckNTErrors(maxDevNum > 0, "No data array for input!");
num = 0;
for (int i = 0; i < maxDevNum; i++) {
char dev[16];
sprintf(dev, "jobdev%d", i);
int id = config->GetInt(dev, -128);
if (id != -128) {
ids[num++] = id;
}
else
break;
}
if (num == 0) {
char dev[16];
sprintf(dev, "jobdev");
int id = config->GetInt(dev, -128);
if (id != -128)
ids[num++] = id;
}
if (num == 0) {
char dev[16];
sprintf(dev, "dev");
int id = config->GetInt(dev, -128);
if (id != -128)
ids[num++] = id;
}
}
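/* Example (a sketch): with config entries jobdev0=0, jobdev1=1 and
jobdev2=-1, this fills ids = {0, 1, -1} and num = 3 (two GPUs plus a CPU,
since negative ids denote CPU devices in ShowSettings below). If no jobdevX
key is found, a single "jobdev" entry is tried, and then "dev". */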
/*
run the trainer (this is the core process)
>> config - configuration
>> dataDistributor - the data distributor that generates an input for the net each time
>> model - the neural network
>> optimizer - the optimizer
*/
void XTrainer::Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(config != NULL, "No input config!");
CheckNTErrors(dataDistributor != NULL, "No input data distributor!");
CheckNTErrors(model != NULL, "No input neural network!");
int epoch = 0;
int step = 0;
int stepAll = 0;
int jobNum = 0;
int accumulation = config->GetInt("accumulation", 1);
int nwarmup = config->GetInt("nwarmup", 0);
float lrate = optimizer->GetLearningRate();
CheckNTErrors(accumulation >= 1, "accumulation must be larger than 0!");
int * ids = new int[MAX_DEVICE_NUM_TRAINING];
GetDevIDs(config, ids, jobNum, MAX_DEVICE_NUM_TRAINING);
optimizer->ShowSettings();
this->ShowSettings(config);
/* create the server and workers */
XLeader leader;
leader.Init();
leader.AddJobWorker(model, jobNum, ids);
leader.AddJobCollectWorker();
leader.AddJobUpdateWorker(model, optimizer);
leader.AddJobBroadcastWorker();
//leader.SetInstantRun();
leader.SetServerModel(config, model);
leader.Start();
/* learning rate scheduler */
XLearningRate LRScheduler;
double startT = GetClockSec();
XPRINT(1, stderr, "[INFO] Initializing the model ... [DONE]\n");
/* train the model */
for (epoch = 0; epoch < optimizer->nepoch; epoch++) {
bool ok = true;
dataDistributor->Start();
while (ok) {
if (++stepAll % accumulation == 0) {
/* learning rate scheduling */
if (nwarmup > 0)
optimizer->SetLearningRate(LRScheduler.MakeLRTransformer(lrate, step + 1, nwarmup));
/* one step of update */
ok = leader.Run(config, dataDistributor, model, optimizer);
float loss = leader.GetLoss() / leader.GetSampleNum();
if ((step + 1) % 100 == 0)
XPRINT5(1, stderr, "[INFO] elapsed=%.1fs epoch:%d step:%d sample:%d loss:%f\n",
GetClockSec() - startT, epoch + 1, step + 1, leader.GetSampleNum(), loss);
leader.ResetParamGrad();
if (++step >= optimizer->nstep)
break;
}
else {
/* one step with no update */
ok = leader.Run(config, dataDistributor, model, NULL);
}
}
dataDistributor->End();
if (step >= optimizer->nstep)
break;
}
delete[] ids;
}
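/* A minimal sketch of driving the trainer. MyModel and MyDataDistributor
stand for hypothetical user-defined subclasses of XModel and
DataDistributeBase (not part of the library), and config is assumed to be
populated already:

MyModel model;
MyDataDistributor data;
XOptimizer optimizer;
optimizer.Init(config);
XTrainer trainer;
trainer.Run(&config, &data, &model, &optimizer);
*/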
/* show settings of training */
void XTrainer::ShowSettings(XConfig* config)
{
int workerNum = 0;
int* ids = new int[MAX_DEVICE_NUM_TRAINING];
GetDevIDs(config, ids, workerNum, MAX_DEVICE_NUM_TRAINING);
XPRINT(1, stderr, "[INFO] Training Setup:\n");
XPRINT2(1, stderr, "%25s = %d\n", "nworker", workerNum);
if (workerNum > 0) {
if (ids[0] < 0) {
XPRINT2(1, stderr, "%25s = CPU[%d]\n", "worker0(server)", ids[0]);
}
else{
XPRINT2(1, stderr, "%25s = GPU[%d]\n", "worker0(server)", ids[0]);
}
for (int i = 1; i < workerNum; i++) {
char name[32];
sprintf(name, "worker%d", i);
if (ids[i] < 0) {
XPRINT2(1, stderr, "%25s = CPU[%d]\n", name, ids[i]);
}
else {
XPRINT2(1, stderr, "%25s = GPU[%d]\n", name, ids[i]);
}
}
}
XPRINT2(1, stderr, "%25s = %d\n", "accumulation", config->GetInt("accumulation", 1));
delete[] ids;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
......@@ -23,18 +24,22 @@
* Distributed training is supported.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-23
* I start coding in 2021 after one year since I typed last C code.
* I started coding in 2021, one year after I typed my last line of C code.
* BUT I was a GOOD TeX writer in 2020 :)
*/
#ifndef __XTRAINER_H__
#define __XTRAINER_H__
#include "XLeader.h"
#include "../network/XNet.h"
#include "../tensor/XQueue.h"
#include "../tensor/XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_DEVICE_NUM_TRAINING 128
/*
Training of neural networks with gradient methods. Here we suppose that we
are training NLP models. The routine could be:
......@@ -56,14 +61,25 @@ the job to the workers and maintain the model.
*/
class XTrainer
{
private:
public:
/* constructor */
XTrainer();
/* de-constructor */
~XTrainer();
protected:
/* get the device ids of the jobs */
void GetDevIDs(XConfig * config, int * ids, int & num, int maxDevNum);
public:
/* run the leader (this is the core process) */
virtual
void Run(XConfig * config, DataDistributeBase * dataDistributor,
XModel * model, XOptimizer * optimizer);
/* show settings of training */
void ShowSettings(XConfig * config);
};
}
#endif // __XTRAINER_H__
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The base class of worker. It maintains a job queue and offers utilities
* for controlling the working pipeline.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XWorker.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XWorker::XWorker()
{
devID = -1;
id = -1;
state = XWORKER_UNSTARTED;
isInstantRun = false;
}
/* de-constructor */
XWorker::~XWorker()
{
Stop();
}
/* set device id */
void XWorker::SetDeviceID(int myDevID)
{
devID = myDevID;
}
/* get device id */
int XWorker::GetDeviceID()
{
return devID;
}
/* set worker id */
void XWorker::SetID(int myID)
{
id = myID;
}
/* get worker id */
int XWorker::GetID()
{
return id;
}
/* set the flag of instant run */
void XWorker::SetInstantRun(bool flag)
{
isInstantRun = flag;
}
/*
enqueue a new job
>> job - the job function
>> jobArgs - the arguments of the function
*/
void XWorker::AddJob(void * job, XList * jobArgs)
{
queue.EnqueueJob(job, jobArgs);
}
/* start the work */
void XWorker::Start()
{
queue.RunJobConsumer();
}
/* stop the work */
void XWorker::Stop()
{
queue.StopJobConsumer();
}
/* get the number of remaining jobs */
int XWorker::GetJobNum()
{
return queue.GetJobNum();
}
/* check whether the job queue is empty */
bool XWorker::IsEmpty()
{
return queue.IsEmpty();
}
/* enqueue a counting job of a finished job */
void XWorker::EnqueueFinishedJob()
{
finishedQueue.Enqueue(NULL);
}
/* dequeue a counting job of a finished job */
void XWorker::DequeueFinishedJob()
{
finishedQueue.Dequeue();
}
/* wrapper of EnqueueFinished() */
void XWorker::EnqueueFinished(XList* args)
{
XWorker* worker = (XWorker*)args->GetItem(0);
worker->EnqueueFinishedJob();
}
/* wrapper of DequeueFinished() */
void XWorker::DequeueFinished(XList* args)
{
XWorker* worker = (XWorker*)args->GetItem(0);
worker->DequeueFinishedJob();
}
/* add a job that enqueues the count of a finished job */
void XWorker::AddJobEnqueueFinished()
{
XList args;
args.Add(this);
if (isInstantRun)
XWorker::EnqueueFinished(&args);
else
queue.EnqueueJob((void*)(char*)XWorker::EnqueueFinished, &args);
}
/* add a job that dequeues the count of a finished job */
void XWorker::AddJobDequeueFinished()
{
XList args;
args.Add(this);
if (isInstantRun)
XWorker::DequeueFinished(&args);
else
queue.EnqueueJob((void*)(char*)XWorker::DequeueFinished, &args);
}
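/* A usage sketch of the job pipeline above. myJob stands for any function
of type void (*)(XList*) whose arguments are packed into an XList, in the
same style as the static wrappers in this file:

XWorker worker;
worker.Start();                      // spawn the job consumer
worker.AddJob((void*)myJob, &args);  // enqueue the actual work
worker.AddJobEnqueueFinished();      // then count it as finished
...
worker.Stop();
*/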
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The base class of worker. It maintains a job queue and offers utilities
* for controlling the working pipeline.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* People started to go back to normal life after the Spring Festival.
* Traffic jams again.
*/
#ifndef __XWORKER_H__
#define __XWORKER_H__
#include "../tensor/XQueue.h"
#include "../tensor/XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
state of a worker
1) unstarted
2) started
3) finished
*/
enum XWORKER_STATE { XWORKER_UNSTARTED, XWORKER_STARTED, XWORKER_FINISHED };
/* the worker class */
class XWorker
{
protected:
/* id of the device where we run the worker (we suppose that
the worker is on-site). */
int devID;
/* id of the worker */
int id;
/* the queue of jobs */
XQueue queue;
/* state of the worker */
XWORKER_STATE state;
/* fire the flag of instant run */
bool isInstantRun;
/* the queue of counting finished jobs */
XQueue finishedQueue;
public:
/* constructor */
XWorker();
/* de-constructor */
~XWorker();
/* set device id */
void SetDeviceID(int myDevID);
/* get device id */
int GetDeviceID();
/* set worker id */
void SetID(int myID);
/* get worker id */
int GetID();
/* set the flag of instant run */
void SetInstantRun(bool flag = true);
/* enqueue a new job */
void AddJob(void * job, XList * jobArgs);
/* start the work */
void Start();
/* stop the work */
void Stop();
/* get the number of remaining jobs */
int GetJobNum();
/* check whether the job queue is empty */
bool IsEmpty();
/* enqueue a counting job of a finished job */
void EnqueueFinishedJob();
/* dequeue a counting job of a finished job */
void DequeueFinishedJob();
/* wrapper of EnqueueFinished() */
static
void EnqueueFinished(XList* args);
/* wrapper of DequeueFinished() */
static
void DequeueFinished(XList* args);
/* add a job that enqueues the count of a finished job */
void AddJobEnqueueFinished();
/* add a job that dequeues the count of a finished job */
void AddJobDequeueFinished();
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that broadcasts the latest parameters from the server to
* the workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
*/
#include "XWorkerBroadcast.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerBroadcast::XWorkerBroadcast()
{
}
/* de-constructor */
XWorkerBroadcast::~XWorkerBroadcast()
{
}
/* set the broadcasting type */
void XWorkerBroadcast::SetBroadcastMode(DATA_BROADCAST_TYPE myMode)
{
broadcastMode = myMode;
}
/*
broadcast data for a parameter
>> source - the data (as a model) that we want to broadcast
>> targetList - the target places that receive the data
>> pid - the parameter index
*/
void XWorkerBroadcast::BroadcastDataSingle(XModel * source, XList * targetList, int pid)
{
CheckNTErrors(source->params[pid].flag == PARAM_STATE_UPDATED,
"The parameter is not ready for broadcasting");
for (int i = 0; i < targetList->count; i++) {
XModel * target = (XModel*)targetList->GetItem(i);
/* data transmit */
BroadcastP2P(source->params[pid].param, target->params[pid].param);
/* update the flag */
target->params[pid].flag = PARAM_STATE_UPDATED;
}
}
/*
broadcast data for a model
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
>> sleepTime - the waiting time in broadcasting
*/
void XWorkerBroadcast::BroadcastData(XModel * source, XList * targetList, int sleepTime)
{
int finished = 0;
int * finishedFlag = new int[source->paramNum];
memset(finishedFlag, 0, sizeof(int) * source->paramNum);
/* check */
for (int i = 0; i < targetList->count; i++) {
XModel * target = (XModel*)targetList->GetItem(i);
CheckNTErrors(source->paramNum == target->paramNum, "Incompatible models!");
}
/* the major body of broadcasting */
while (1) {
for (int i = 0; i < source->paramNum; i++) {
if (source->params[i].flag == PARAM_STATE_UPDATED && finishedFlag[i] == 0) {
/* broadcasting */
BroadcastDataSingle(source, targetList, i);
/* counting */
finished += targetList->count;
finishedFlag[i] = 1;
}
}
if (finished == source->paramNum * targetList->count)
break;
XSleep(sleepTime);
}
delete[] finishedFlag;
}
/*
wrapper of BroadcastDataSingle
>> args - the list of arguments
*/
void XWorkerBroadcast::BroadcastSingle(XList * args)
{
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(0);
XModel * source = (XModel*)args->GetItem(1);
/* target models */
int targetNum = args->GetItemInt(2);
XList target;
for (int i = 0; i < targetNum; i++) {
XModel * model = (XModel*)args->GetItem(3 + i);
target.Add(model);
}
/* parameter index */
int p = args->GetInt(3 + targetNum);
broadcaster->BroadcastDataSingle(source, &target, p);
}
/*
wrapper of BroadcastData
>> args - the list of arguments
*/
void XWorkerBroadcast::Broadcast(XList * args)
{
//fprintf(stderr, "broadcast 0\n");
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(0);
XModel * source = (XModel*)args->GetItem(1);
/* target models */
int targetNum = args->GetItemInt(2);
XList target;
for (int i = 0; i < targetNum; i++) {
XModel * model = (XModel*)args->GetItem(3 + i);
target.Add(model);
}
broadcaster->BroadcastData(source, &target, SLEEP_TIME_IN_BROADCASTING);
//fprintf(stderr, "broadcast 1\n");
}
/*
P2P data broadcasting
>> source - the source data
>> target - the target data
*/
void XWorkerBroadcast::BroadcastP2P(XTensor * source, XTensor * target)
{
CheckNTErrors(source != NULL, "The source tensor should not be NULL!");
CheckNTErrors(target != NULL, "The target tensor should not be NULL!");
CheckNTErrors(IsSameShaped(*source, *target), "The two tensors should be of the same shape!");
if(source != target)
CopyValues(*source, *target);
}
/*
add a new job of broadcasting data (for a parameter)
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
>> pid - the parameter index
*/
bool XWorkerBroadcast::AddJobBroadcastSingle(XModel * source, XList * targetList, int pid)
{
CheckNTErrors(source != NULL, "no input source tensor!");
CheckNTErrors(targetList != NULL, "no input target tensor list!");
CheckNTErrors(pid >= 0 && pid < source->paramNum, "illegal parameter index!");
XList args;
args.Add(this);
args.Add(source);
args.AddInt(targetList->count);
args.AddList(targetList);
args.AddInt(pid);
if (isInstantRun)
XWorkerBroadcast::BroadcastSingle(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerBroadcast::BroadcastSingle, &args);
return true;
}
/*
add a new job of broadcasting data (for a model)
>> source - the data that we want to broadcast
>> targetList - the target places that receive the data
*/
bool XWorkerBroadcast::AddJobBroadcast(XModel * source, XList * targetList)
{
CheckNTErrors(source != NULL, "no input source tensor!");
CheckNTErrors(targetList != NULL, "no input target tensor list!");
XList args;
args.Add(this);
args.Add(source);
args.AddInt(targetList->count);
args.AddList(targetList);
if (isInstantRun)
XWorkerBroadcast::Broadcast(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerBroadcast::Broadcast, &args);
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that broadcasts the latest parameters from the server to
* the workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
* Several visitors will come today, so I have less time for coding.
*/
#ifndef __XWORKERBROADCAST_H__
#define __XWORKERBROADCAST_H__
#include "XWorker.h"
#include "XModel.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_BROADCASTING 5
/*
data broadcasting method
1) point-to-point
*/
enum DATA_BROADCAST_TYPE { DATA_BROADCAST_P2P };
/* This class defines a broadcaster that transmits parameters from
a server to workers. */
class XWorkerBroadcast : public XWorker
{
protected:
DATA_BROADCAST_TYPE broadcastMode;
public:
/* constructor */
XWorkerBroadcast();
/* de-constructor */
~XWorkerBroadcast();
/* set the broadcasting type */
void SetBroadcastMode(DATA_BROADCAST_TYPE myMode);
/* broadcast data for a parameter */
void BroadcastDataSingle(XModel * source, XList * targetList, int pid);
/* broadcast data for a model */
void BroadcastData(XModel * source, XList * targetList, int sleepTime);
/* wrapper of BroadcastDataSingle */
static
void BroadcastSingle(XList * args);
/* wrapper of BroadcastData */
static
void Broadcast(XList * args);
/* P2P data broadcasting */
void BroadcastP2P(XTensor * source, XTensor * target);
/* add a new job of broadcasting data (for a parameter) */
bool AddJobBroadcastSingle(XModel * source, XList * targetList, int pid);
/* add a new job of broadcasting data (for a model) */
bool AddJobBroadcast(XModel * source, XList * targetList);
};
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that collects data from workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XWorkerCollect.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerCollect::XWorkerCollect()
{
collectMode = DATA_COLLECT_P2P;
}
/* de-constructor */
XWorkerCollect::~XWorkerCollect()
{
}
/* set the collection type */
void XWorkerCollect::SetCollectMode(DATA_COLLECT_TYPE myMode)
{
collectMode = myMode;
}
/*
collect the gradient data, update the parameters, and broadcast the
new parameters to all models. NOTE that this method just collects gradients
from member models. It then calls an XWorkerUpdate to update the parameters.
The XWorkerUpdate in turn calls an XWorkerBroadcast to broadcast the new
parameters back to the member models.
>> memberActive - member models that are active, i.e., have generated gradients
>> memberAll - all member models
>> server - the server model
>> optimizer - the optimizer
>> updater - the worker that updates the parameters
>> broadcaster - the worker that broadcasts the new parameters to all member
models
>> sleepTime - waiting time in collecting
*/
void XWorkerCollect::UpdateDataAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater,
XWorkerBroadcast * broadcaster, int sleepTime)
{
int finished = 0;
for (int j = 0; j < server->paramNum; j++)
server->params[j].flag = PARAM_STATE_NOT_READY;
/* check */
for (int i = 0; i < memberAll->count; i++) {
XModel * source = (XModel*)memberAll->GetItem(i);
CheckNTErrors(source->paramNum == server->paramNum, "Incompatible models!");
}
for (int i = 0; i < memberActive->count; i++) {
XModel * source = (XModel*)memberActive->GetItem(i);
CheckNTErrors(source->paramNum == server->paramNum, "Incompatible models!");
}
/* count how many member models have been collected for each parameter */
int * finishedCount = new int[server->paramNum];
memset(finishedCount, 0, sizeof(int) * server->paramNum);
/* This is a simple implementation of the wait-and-collect process. But
there is a risk that some models are not available, that is, the
loop would never stop. A solution might be that we force the loop
to break after waiting for a short time. */
while (1) {
if (collectMode == DATA_COLLECT_P2P) {
for (int j = 0; j < server->paramNum; j++) {
XParamKeeper &paramServer = server->params[j];
/* paramServer.param->isGradFinished is true only if the model finishes the computation
(in another process) */
if (paramServer.flag != PARAM_STATE_NOT_READY || !paramServer.param->isGradFinished)
continue;
/* check if all the models (or part of them) are ready */
for (int i = 0; i < memberActive->count; i++) {
XModel * source = (XModel*)memberActive->GetItem(i);
XParamKeeper &paramSource = source->params[j];
/* paramSource.param->isGradFinished is true only if the model finishes the computation
(in another process) */
if (paramSource.flag == PARAM_STATE_NOT_READY && paramSource.param->isGradFinished) {
/* data transmit */
CollectP2P(paramSource.param->grad, paramServer.param->grad);
/* reset the flag */
paramSource.flag = PARAM_STATE_COLLECTED;
finished++;
finishedCount[j]++;
/* we call model update (in another thread) and then
broadcast the new parameters to member models
(in another thread) */
if (finishedCount[j] == memberActive->count) {
paramServer.flag = PARAM_STATE_COLLECTED;
if (updater != NULL) {
updater->AddJobUpdateSingle(server, memberAll, j, optimizer, broadcaster);
updater->AddJobEnqueueFinished();
}
}
else if (finishedCount[j] > memberActive->count) {
ShowNTErrors("Something is wrong with finishedCount!");
}
}
}
}
}
else {
ShowNTErrors("Unsupported data collection mode!");
}
/* the collection finishes if all data tensors are processed */
if (finished == server->paramNum * memberActive->count)
break;
XSleep(sleepTime);
}
delete[] finishedCount;
}
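/* In short, the pipeline above does the following for each parameter j:
1) wait until every active member has finished computing its gradient;
2) CollectP2P: add each member's gradient into the server's gradient;
3) once all members are collected, mark the server copy as
PARAM_STATE_COLLECTED and hand it to the updater, which updates it and
triggers the broadcaster. Different parameters overlap in time, so
early-finishing parameters are transmitted and updated while later
gradients are still being computed. */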
/* wrapper of UpdateDataAll */
void XWorkerCollect::UpdateAll(XList * args)
{
XWorkerCollect * collecter = (XWorkerCollect*)args->GetItem(0);
int activeNum = args->GetInt(1);
XList memberActive;
for (int i = 0; i < activeNum; i++) {
XModel * member = (XModel*)args->GetItem(2 + i);
memberActive.Add(member);
}
int allNum = args->GetInt(2 + activeNum);
XList memberAll;
for (int i = 0; i < allNum; i++) {
XModel * member = (XModel*)args->GetItem(2 + activeNum + 1 + i);
memberAll.Add(member);
}
XModel * server = (XModel*)args->GetItem(2 + activeNum + 1 + allNum);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(2 + activeNum + 1 + allNum + 1);
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(2 + activeNum + 1 + allNum + 2);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(2 + activeNum + 1 + allNum + 3);
collecter->UpdateDataAll(&memberActive, &memberAll, server,
optimizer, updater, broadcaster,
SLEEP_TIME_IN_COLLECTING);
}
/*
P2P data collection
target += source
>> source - the source tensor
>> target - the target tensor
*/
void XWorkerCollect::CollectP2P(XTensor * source, XTensor * target)
{
CheckNTErrors(source != NULL, "The source tensor should not be NULL!");
CheckNTErrors(target != NULL, "The target tensor should not be NULL!");
CheckNTErrors(IsSameShaped(*source, *target), "The two tensors should be of the same shape!");
/* target += source */
if (source != target) {
XTensor * sourceOnSite = source;
if (source->devID != target->devID) {
sourceOnSite = new XTensor(target);
_CopyValues(source, sourceOnSite);
}
_Sum(target, sourceOnSite, target);
if (sourceOnSite != source)
delete sourceOnSite;
}
}
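/* Example: if source lives on GPU 0 and target on GPU 1, the code above
first copies source into a temporary tensor on the target device
(_CopyValues) and then runs _Sum(target, copy, target) there, so the
accumulation always happens on the target device; if both tensors sit on
the same device, no copy is made. */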
/*
sum-reduce for given tensors
target += source_0
target += source_1
...
target += source_n
>> source - the source tensor
>> target - the target tensor
*/
void XWorkerCollect::CollectReduceSum(XList * source, XTensor * target)
{
for (int i = 0; i < source->count; i++) {
XTensor * s = (XTensor*)source->GetItem(i);
CollectP2P(s, target);
}
}
/*
all-reduce: the well-known all-reduce method
every tensor is involved in every data transmission. The final outcome
is that all input tensors share the same value (i.e., the sum of them).
>> all - the tensors for sum
*/
void XWorkerCollect::CollectAllReduce(XList * all)
{
ShowNTErrors("TODO!");
}
/*
add a new job of collecting data, update the parameter and
broadcast the new parameter
>> memberActive - member models that are active, i.e., have generated gradients
>> memberAll - all member models
>> server - the server model
>> optimizer - the optimizer
>> updater - the worker that updates the parameters
>> broadcaster - the worker that broadcasts the new parameters to all member
models
<< return - successful or not
*/
bool XWorkerCollect::AddJobUpdateAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster)
{
CheckNTErrors(memberActive != NULL, "No input (active) member list!");
CheckNTErrors(memberAll != NULL, "No input (all) member list!");
CheckNTErrors(server != NULL, "No input server model!");
CheckNTErrors(optimizer != NULL, "No input optimizer!");
CheckNTErrors(updater != NULL, "No input updater!");
CheckNTErrors(broadcaster != NULL, "No input broadcaster!");
XList args;
args.Add(this);
args.AddInt(memberActive->count);
args.AddList(memberActive);
args.AddInt(memberAll->count);
args.AddList(memberAll);
args.Add(server);
args.Add(optimizer);
args.Add(updater);
args.Add(broadcaster);
if (isInstantRun)
XWorkerCollect::UpdateAll(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::UpdateAll, &args);
return true;
}
/*
add a new job of collecting data
>> sourceList - the list of models that we want collect data from
>> target - the destination of the collection
<< return - successful or not
*/
bool XWorkerCollect::AddJobCollect(XList * sourceList, XModel * target)
{
CheckNTErrors(sourceList != NULL, "no input source model list!");
CheckNTErrors(target != NULL, "no input target model!");
XList args;
args.Add(this);
args.AddInt(sourceList->count);
args.AddList(sourceList);
args.AddInt(0);
args.Add(target);
args.Add(NULL);
args.Add(NULL);
args.Add(NULL);
if (isInstantRun)
XWorkerCollect::UpdateAll(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::UpdateAll, &args);
return true;
}
/*
collect the data of the run (i.e., loss). This is a reducer.
>> sourceList - the list of records
>> target - the record in which we keep the reduced result
>> sleepTime - waiting time in collecting data
*/
void XWorkerCollect::CollectOtherData(XList* sourceList, XNNRecord* target, int sleepTime)
{
int finished = 0;
int* flags = new int[sourceList->count];
for (int i = 0; i < sourceList->count; i++)
flags[i] = 0;
while (1) {
for (int i = 0; i < sourceList->count; i++) {
if (flags[i] != 0)
continue;
XNNRecord* source = (XNNRecord*)sourceList->GetItem(i);
if (source->state == XWORKER_FINISHED) {
if(target != source)
target->Update(*source);
flags[i] = 1;
finished++;
}
}
if (finished == sourceList->count)
break;
XSleep(sleepTime);
}
delete[] flags;
}
/* wrapper of CollectOtherData */
void XWorkerCollect::CollectOther(XList* args)
{
//fprintf(stderr, "collect data other 0\n");
XWorkerCollect* collecter = (XWorkerCollect*)args->GetItem(0);
int sourceNum = args->GetItemInt(1);
/* the source records */
XList source;
for (int i = 0; i < sourceNum; i++) {
XNNRecord * record = (XNNRecord*)args->GetItem(2 + i);
source.Add(record);
}
/* the target record */
XNNRecord* target = (XNNRecord*)args->GetItem(2 + sourceNum);
collecter->CollectOtherData(&source, target, SLEEP_TIME_IN_COLLECTING_OTHER);
//fprintf(stderr, "collect data other 1\n");
}
/*
add a new job of collecting data of the run (i.e., loss). This is a reducer.
>> sourceList - the list of records
>> target - the record in which we keep the reduced result
*/
bool XWorkerCollect::AddJobCollectOther(XList* sourceList, XNNRecord* target)
{
CheckNTErrors(sourceList != NULL, "no input source record list!");
CheckNTErrors(target != NULL, "no input target record!");
XList args;
args.Add(this);
args.AddInt(sourceList->count);
args.AddList(sourceList);
args.Add(target);
if (isInstantRun)
XWorkerCollect::CollectOther(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerCollect::CollectOther, &args);
return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that collects data from workers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-02
* minus 10 degrees centigrade comes again!
*/
#ifndef __XWORKERCOLLECT_H__
#define __XWORKERCOLLECT_H__
#include "XWorker.h"
#include "XModel.h"
#include "XWorkerJob.h"
#include "XWorkerUpdate.h"
#include "XWorkerBroadcast.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_COLLECTING 5
#define SLEEP_TIME_IN_COLLECTING_OTHER 5
/*
data collection method
1) point-to-point
2) reduce sum
3) all-reduce (not in the enum yet; see CollectAllReduce below)
*/
enum DATA_COLLECT_TYPE { DATA_COLLECT_P2P, DATA_COLLECT_REDUCESUM};
/* This class defines the data-collecting worker. It collects (gradient) data
from workers for the leader (server). */
class XWorkerCollect : public XWorker
{
protected:
DATA_COLLECT_TYPE collectMode;
public:
/* constructor */
XWorkerCollect();
/* de-constructor */
~XWorkerCollect();
/* set the collection type */
void SetCollectMode(DATA_COLLECT_TYPE myMode);
/* collect the gradient data, update the parameters, and broadcast the
new parameters to all models. NOTE that this method just collects gradients
from member models. It then calls an XWorkerUpdate to update the parameters.
The XWorkerUpdate in turn calls an XWorkerBroadcast to broadcast the new
parameters back to the member models. */
void UpdateDataAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster,
int sleepTime);
/* wrapper of UpdateDataAll */
static
void UpdateAll(XList * args);
/* P2P data collection */
void CollectP2P(XTensor * source, XTensor * target);
/* sum-reduce for given tensors */
void CollectReduceSum(XList * source, XTensor * target);
/* all-reduce */
void CollectAllReduce(XList * all);
/* add a new job of collecting data, update the parameter and broadcast the new parameter */
bool AddJobUpdateAll(XList * memberActive, XList * memberAll, XModel * server,
XOptimizer * optimizer, XWorkerUpdate * updater, XWorkerBroadcast * broadcaster);
/* add a new job of collecting data */
bool AddJobCollect(XList * sourceList, XModel * target);
/* collect the data of the run (i.e., loss). This is a reducer. */
void CollectOtherData(XList * sourceList, XNNRecord * target, int sleepTime);
/* wrapper of CollectOtherData */
static
void CollectOther(XList * args);
/* add a new job of collecting data of the run (i.e., loss) */
bool AddJobCollectOther(XList * sourceList, XNNRecord * target);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker of running the neural network.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
*/
#include "XWorkerJob.h"
#include "../tensor/XList.h"
#include "../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XWorkerJob::XWorkerJob()
{
Clear();
}
/* de-constructor */
XWorkerJob::~XWorkerJob()
{
for (int i = 0; i < inputs.count; i++)
delete (XTensor*)inputs[i];
for (int i = 0; i < outputs.count; i++)
delete (XTensor*)outputs[i];
for (int i = 0; i < golds.count; i++)
delete (XTensor*)golds[i];
for (int i = 0; i < losses.count; i++)
delete (XTensor*)losses[i];
}
/* set the model */
void XWorkerJob::SetModel(XModel * myModel)
{
model = myModel;
}
/* get the model */
XModel * XWorkerJob::GetModel()
{
return model;
}
/* set the state of the worker */
void XWorkerJob::SetState(XWORKER_STATE myState)
{
state = myState;
record.state = myState;
}
/* clear the worker */
void XWorkerJob::Clear()
{
for (int i = 0; i < inputs.count; i++)
delete (XTensor*)inputs[i];
inputs.Clear();
inputs.Add(new XTensor());
for (int i = 0; i < outputs.count; i++)
delete (XTensor*)outputs[i];
outputs.Clear();
outputs.Add(new XTensor());
for (int i = 0; i < golds.count; i++)
delete (XTensor*)golds[i];
golds.Clear();
golds.Add(new XTensor());
for (int i = 0; i < losses.count; i++)
delete (XTensor*)losses[i];
losses.Clear();
losses.Add(new XTensor());
record.Clear();
SetState(XWORKER_UNSTARTED);
}
/* get the input list */
XList * XWorkerJob::GetInput()
{
return &inputs;
}
/* get the output list */
XList * XWorkerJob::GetOutput()
{
return &outputs;
}
/* get the gold standard */
XList * XWorkerJob::GetGold()
{
return &golds;
}
/* get the loss */
XList * XWorkerJob::GetLoss()
{
return &losses;
}
/* get the record of the run */
XNNRecord * XWorkerJob::GetRecord()
{
return &record;
}
/* record some stuff */
void XWorkerJob::RecordMe()
{
float lossAll = 0;
int sampleNum = 0;
for (int i = 0; i < losses.count; i++) {
XTensor* loss = (XTensor*)losses[i];
lossAll += ReduceSumAllValue(*loss);
sampleNum += loss->GetSize();
}
record.lossAll = lossAll;
record.sampleNum = sampleNum;
int predictNum = 0;
for (int i = 0; i < outputs.count; i++) {
XTensor* output = (XTensor*)outputs[i];
predictNum += output->GetSize();
}
record.predictNum = predictNum;
}
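/* A concrete reading of the bookkeeping above: if this worker holds one
loss tensor with 32 entries summing to 80.5, RecordMe() sets
lossAll = 80.5 and sampleNum = 32, so the loss reported by the trainer
(lossAll / sampleNum) is about 2.52. Note that sampleNum counts loss
entries (GetSize), not sentences or batches. */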
/* get the sum of losses over samples */
float XWorkerJob::GetLossAll()
{
return record.lossAll;
}
/* get the number of samples */
int XWorkerJob::GetSampleNum()
{
return record.sampleNum;
}
/* get the number of outputs (predictions) */
int XWorkerJob::GetPredictNum()
{
return record.predictNum;
}
/*
add a new job of model refreshment
>> myModel - the model
<< return - succeeded or not
*/
bool XWorkerJob::AddJobRefresh(XModel * myModel)
{
//fprintf(stderr, "refresh 0\n");
CheckNTErrors(myModel != NULL, "no parameter keeper!");
XList args(1);
args.Add(myModel);
if(isInstantRun)
XModel::Refresh(&args);
else
queue.EnqueueJob((void*)(char*)XModel::Refresh, &args);
//fprintf(stderr, "refresh 1\n");
return true;
}
/*
add a new job of neural network forward and backward computation (with the input)
>> myModel - the model
>> inputs - inputs of the neural network
>> outputs - outputs of the neural network
>> golds - gold standards
>> losses - losses of the outputs with respect to the gold standards
<< return - succeeded or not
*/
bool XWorkerJob::AddJobNeuralNet(XModel * myModel,
XList * inputs, XList * outputs, XList * golds, XList * losses)
{
CheckNTErrors(myModel != NULL, "no input neural network!");
CheckNTErrors(inputs != NULL, "no inputs of the model!");
CheckNTErrors(outputs != NULL, "no outputs of the model!");
XList args;
args.Add(myModel);
args.Add(inputs);
args.Add(outputs);
args.Add(golds);
args.Add(losses);
if(isInstantRun)
XModel::Run(&args);
else
queue.EnqueueJob((void*)(char*)XModel::Run, &args);
SetState(XWORKER_STARTED);
return true;
}
/* wrapper of RecordMe */
void XWorkerJob::RecordMeStatic(XList* args)
{
//fprintf(stderr, "record static 0\n");
CheckNTErrors(args != NULL && args->count > 0, "Illegal arguments!");
XWorkerJob * worker = (XWorkerJob*)args->GetItem(0);
XNNRecord * serverRecord = (XNNRecord *)args->GetItem(1);
worker->RecordMe();
/* push information to the server end */
MUTEX_LOCK(serverRecord->mutex);
serverRecord->Update(*worker->GetRecord());
MUTEX_UNLOCK(serverRecord->mutex);
worker->SetState(XWORKER_FINISHED);
//fprintf(stderr, "record static 1\n");
}
/*
add a new job of recording the running of the neural network
>> serverRecord - the record on the server side
*/
bool XWorkerJob::AddJobRecord(XNNRecord * serverRecord)
{
XList args;
args.Add(this);
args.Add(serverRecord);
if (isInstantRun)
XWorkerJob::RecordMeStatic(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerJob::RecordMeStatic, &args);
return true;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker of running the neural network.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-24
* My son got new glasses yesterday.
*/
#ifndef __XWORKERJOB_H__
#define __XWORKERJOB_H__
#include "XWorker.h"
#include "XModel.h"
#include "XNNRecord.h"
#include "XBaseTemplate.h"
#include "../tensor/XList.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* a model template for training */
class XWorkerJob : public XWorker
{
protected:
/* the model */
XModel * model;
/* the input tensors of the model */
XList inputs;
/* the output tensors of the model */
XList outputs;
/* the gold standard */
XList golds;
/* the loss */
XList losses;
/* record the information in running the neural network */
XNNRecord record;
public:
/* constructor */
XWorkerJob();
/* de-constructor */
~XWorkerJob();
/* set the parameter keeper */
void SetModel(XModel * myModel);
/* get the parameter keeper */
XModel * GetModel();
/* set the state of the worker */
void SetState(XWORKER_STATE myState);
/* clear the worker */
void Clear();
/* get the input list */
XList * GetInput();
/* get the output list */
XList * GetOutput();
/* get the gold standard */
XList * GetGold();
/* get the loss */
XList * GetLoss();
/* get the record of the run */
XNNRecord * GetRecord();
/* record some stuff */
void RecordMe();
/* get the sum of losses over samples */
float GetLossAll();
/* get the number of samples */
int GetSampleNum();
/* get the number of outputs (predictions) */
int GetPredictNum();
/* add a new job of model refreshment */
bool AddJobRefresh(XModel * myModel);
/* add a new job of neural network forward and backward computation (with the input) */
bool AddJobNeuralNet(XModel * myModel, XList * inputs, XList * outputs, XList * golds, XList * losses);
/* add a new job of recording the running of the neural network */
bool AddJobRecord(XNNRecord * serverRecord);
private:
/* wrapper of RecordMe */
static
void RecordMeStatic(XList * args);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that updates the model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#include "XWorkerUpdate.h"
namespace nts { // namespace nts (NiuTrans.Tensor)
/* constructor */
XWorkerUpdate::XWorkerUpdate()
{
optimizer = NULL;
}
/* de-constructor */
XWorkerUpdate::~XWorkerUpdate()
{
}
/* set the optimizer */
void XWorkerUpdate::SetOptimizer(XOptimizer * myOptimizer)
{
optimizer = myOptimizer;
}
/* get the optimizer */
XOptimizer * XWorkerUpdate::GetOptimizer()
{
return optimizer;
}
/*
update a parameter of a model
>> server - the model that we want to update (on the server side)
>> members - models that would share the updated parameters
>> pid - the parameter index
>> optimizer - the optimizer
>> broadcaster - the worker that would broadcast the new parameter to members
*/
void XWorkerUpdate::UpdateParameter(XModel * server, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster)
{
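    /* parameter state machine: a parameter enters here as PARAM_STATE_COLLECTED
       (its gradient has been aggregated) and leaves as PARAM_STATE_UPDATED;
       broadcasting the fresh value to the member models is delegated to
       another worker so that this thread is not blocked */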
CheckNTErrors(server->params[pid].flag == PARAM_STATE_COLLECTED, "The state of the parameter is wrong!");
XTensor * param = server->params[pid].param;
XTensor * grad = param->grad;
CheckNTErrors(grad != NULL, "No gradient!");
/* update the parameter */
optimizer->UpdateParam(param, grad, pid);
/* set the flag */
server->params[pid].flag = PARAM_STATE_UPDATED;
/* broadcast the new parameter to other models (in another worker/thread) */
broadcaster->AddJobBroadcastSingle(server, members, pid);
broadcaster->AddJobEnqueueFinished();
}
/*
update the model
>> model - the model that we want to update
>> optimizer - the optimizer
>> sleepTime - waiting time in each update
*/
void XWorkerUpdate::UpdateModel(XModel * model, XOptimizer * optimizer, int sleepTime)
{
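    /* polling protocol: repeatedly scan the parameter flags and update any
       parameter whose gradient has been collected, until all paramNum
       parameters have been updated once in this round */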
int finished = 0;
while (1) {
for (int i = 0; i < model->paramNum; i++) {
if (model->params[i].flag == PARAM_STATE_COLLECTED) {
XTensor * param = model->params[i].param;
XTensor * grad = param->grad;
CheckNTErrors(grad != NULL, "No gradient!");
/* update the parameter */
optimizer->UpdateParam(param, grad, i);
/* set the flag */
model->params[i].flag = PARAM_STATE_UPDATED;
finished++;
}
}
if (finished == model->paramNum)
break;
XSleep(sleepTime);
}
optimizer->Note(model);
}
/*
wrapper of UpdateParameter
>> args - arguments of the update
*/
void XWorkerUpdate::UpdateSingle(XList * args)
{
CheckNTErrors(args != NULL && args->count >= 6, "Illegal argument list!");
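    /* expected argument layout (mirroring AddJobUpdateSingle below):
       0: the updater, 1: the server-side model, 2: number of member models,
       3 ... 2 + memNum: the member models, 3 + memNum: the parameter index,
       4 + memNum: the optimizer, 5 + memNum: the broadcaster */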
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(0);
XModel * server = (XModel*)args->GetItem(1);
int memNum = args->GetInt(2);
XList members;
for (int i = 0; i < memNum; i++) {
XModel * member = (XModel*)args->GetItem(3 + i);
members.Add(member);
}
int pid = args->GetInt(3 + memNum);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(3 + memNum + 1);
XWorkerBroadcast * broadcaster = (XWorkerBroadcast*)args->GetItem(3 + memNum + 2);
if(updater != NULL)
updater->UpdateParameter(server, &members, pid, optimizer, broadcaster);
}
/*
wrapper of UpdateModel
>> args - arguments of the update
*/
void XWorkerUpdate::Update(XList * args)
{
//fprintf(stderr, "update 0\n");
CheckNTErrors(args != NULL && args->count >= 3, "Illegal argument list!");
XWorkerUpdate * updater = (XWorkerUpdate*)args->GetItem(0);
XModel * model = (XModel*)args->GetItem(1);
XOptimizer * optimizer = (XOptimizer*)args->GetItem(2);
if(updater != NULL)
updater->UpdateModel(model, optimizer, SLEEP_TIME_IN_MODEL_UPDATE);
//fprintf(stderr, "update 1\n");
}
/*
add a new job of model update (for a parameter)
>> model - the model that we want to update (on the server side)
>> members - models that would share the updated parameters
>> pid - the parameter index
>> optimizer - the optimizer
>> broadcaster - the worker that would broadcast the new parameter to members
*/
bool XWorkerUpdate::AddJobUpdateSingle(XModel * model, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster)
{
CheckNTErrors(model != NULL, "No input model!");
CheckNTErrors(members != NULL, "No member model list!");
CheckNTErrors(optimizer != NULL, "No optimizer!");
CheckNTErrors(broadcaster != NULL, "No broadcaster!");
CheckNTErrors(pid >= 0 && pid < model->paramNum, "Illegal parameter index!");
XList args;
args.Add(this);
args.Add(model);
args.AddInt(members->count);
args.AddList(members);
args.AddInt(pid);
args.Add(optimizer);
args.Add(broadcaster);
if (isInstantRun)
XWorkerUpdate::UpdateSingle(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerUpdate::UpdateSingle, &args);
return true;
}
/*
add a new job of model update
>> model - the model that we want to update
>> optimizer - the optimizer
*/
bool XWorkerUpdate::AddJobUpdate(XModel * model, XOptimizer * optimizer)
{
CheckNTErrors(model != NULL, "No input model!");
CheckNTErrors(optimizer != NULL, "No optimizer!");
XList args;
args.Add(this);
args.Add(model);
args.Add(optimizer);
if(isInstantRun)
XWorkerUpdate::Update(&args);
else
queue.EnqueueJob((void*)(char*)XWorkerUpdate::Update, &args);
return true;
}
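/* A minimal usage sketch (a comment only; "serverModel", "memberModels",
   "adam" and "broadcaster" are hypothetical names for objects assumed to be
   created and wired up elsewhere):

       XWorkerUpdate updater;
       updater.SetOptimizer(&adam);
       updater.AddJobUpdateSingle(&serverModel, &memberModels, pid, &adam, &broadcaster);

   With isInstantRun disabled the job is queued and later executed by the
   worker's own thread. */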
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* The worker that updates the model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-01
*/
#ifndef __XWORKERUPDATE_H__
#define __XWORKERUPDATE_H__
#include "XWorker.h"
#include "XOptimizer.h"
#include "XWorkerBroadcast.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define SLEEP_TIME_IN_MODEL_UPDATE 5
/* The class defines the model-update worker */
class XWorkerUpdate : public XWorker
{
protected:
/* the optimizer */
XOptimizer * optimizer;
public:
/* constructor */
XWorkerUpdate();
/* de-constructor */
~XWorkerUpdate();
/* set the optimizer */
void SetOptimizer(XOptimizer * myOptimizer);
/* get the optimizer */
XOptimizer * GetOptimizer();
/* update the parameter */
void UpdateParameter(XModel * server, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster);
/* update the model */
void UpdateModel(XModel * model, XOptimizer * optimizer, int sleepTime);
/* wrapper of UpdateParameter */
static
void UpdateSingle(XList * args);
/* wrapper of UpdateModel */
static
void Update(XList * args);
/* add a new job of model update (for a parameter) */
bool AddJobUpdateSingle(XModel * model, XList * members, int pid,
XOptimizer * optimizer, XWorkerBroadcast * broadcaster);
/* add a new job of model update */
bool AddJobUpdate(XModel * model, XOptimizer * optimizer);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the Adam optimizer.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
*/
#include "Adam.h"
#include "../../tensor/core/CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
Adam::Adam() : XOptimizer()
{
Clear();
}
/* de-constructor */
Adam::~Adam()
{
Clear();
}
/*
initialize the optimizer
>> config - the configuration
*/
void Adam::Init(XConfig &config)
{
XOptimizer::Init(config);
adamBeta1 = config.GetFloat("adambeta1", 0.9F);
adamBeta2 = config.GetFloat("adambeta2", 0.98F);
adamDelta = config.GetFloat("adamdelta", 1e-9F);
}
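/* A small sketch of how the hyper-parameters above are supplied (assuming
   the XConfig instance is filled from the command line or a file elsewhere):

       Adam opt;
       opt.Init(config);     // reads "adambeta1", "adambeta2", "adamdelta"
       opt.ShowSettings();   // print the settings that were loaded
*/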
/* clear the optimizer */
void Adam::Clear()
{
XOptimizer::Clear();
for (int i = 0; i < moments.count; i++) {
XTensor * m = moments[i];
delete m;
}
moments.Clear();
for (int i = 0; i < moments2nd.count; i++) {
XTensor * m2nd = moments2nd[i];
delete m2nd;
}
moments2nd.Clear();
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/* reset the optimizer (re-start) */
void Adam::Reset()
{
for (int i = 0; i < moments.count; i++) {
XTensor * m = moments[i];
m->SetZeroAll();
}
for (int i = 0; i < moments2nd.count; i++) {
XTensor * m2nd = moments2nd[i];
m2nd->SetZeroAll();
}
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
}
/* show settings */
void Adam::ShowSettings()
{
XPRINT(1, stderr, "[INFO] Optimizer = Adam\n");
XOptimizer::ShowSettings();
XPRINT2(1, stderr, "%25s = %f\n", "adambeta1", adamBeta1);
XPRINT2(1, stderr, "%25s = %f\n", "adambeta2", adamBeta2);
XPRINT2(1, stderr, "%25s = %f\n", "adamdelta", adamDelta);
}
/* record the update */
void Adam::Note(XModel * model)
{
nstep++;
}
/*
update a parameter matrix using Adam
>> param - the parameter to update
>> grad - the gradient of the parameter
>> pid - index of the parameter
*/
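/* note: moments[pid] and moments2nd[pid] are assumed to have been allocated
   (one tensor per model parameter) before the first update; Clear() above
   frees them and Reset() zeroes them */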
void Adam::UpdateParam(XTensor * param, XTensor * grad, int pid)
{
adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2;
float e = lrate * (float)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
float d = adamDelta * (float)sqrt(1 - adamBeta2T);
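    /* the bias corrections of Adam are folded into scalars here: with
       m^ = m / (1 - beta_1^t) and v^ = v / (1 - beta_2^t), the update
           param = param - lrate * m^ / (sqrt(v^) + delta)
       is algebraically the same as
           param = param - e * m / (sqrt(v) + d)
       with e and d as computed above, so no extra tensor work is needed */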
/* m = beta_1 * m + (1-beta_1) * grad */
XTensor * m = moments[pid];
_ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, grad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad */
XTensor * v = moments2nd[pid];
_Multiply(grad, grad, v, adamBeta2 / (1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* allocate a piece of buffer memory */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
/* v2 = m / (sqrt(v) + d) */
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
/* the delta rule */
_Sum(param, v2, param, -e);
/* release a piece of buffer memory */
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* An implementation of the Adam optimizer.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
* A foggy day. But all my students come back for work after the holiday
* - full of happiness to see a new start.
*/
#ifndef __ADAM_H__
#define __ADAM_H__
#include "../XOptimizer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* an implementation of the Adam optimizer */
class Adam : public XOptimizer
{
protected:
/* list of the moment of the parameter matrices */
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
TensorList moments2nd;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
public:
/* constructor */
Adam();
/* de-constructor */
~Adam();
/* initialize the optimizer */
void Init(XConfig &config);
/* clear the optimizer */
void Clear();
/* reset the optimizer (re-start) */
void Reset();
/* show settings */
void ShowSettings();
/* record the update */
void Note(XModel * model);
/* update a parameter matrix */
void UpdateParam(XTensor * param, XTensor * grad, int pid);
};
}
#endif
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is a header that includes all optimizer headers.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-15
*/
#include "XTrainer.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XTrainer::XTrainer()
{
}
#ifndef __OHEADER_H__
#define __OHEADER_H__
/* de-constructor */
XTrainer::~XTrainer()
{
}
#include "Adam.h"
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
#endif
\ No newline at end of file