Merge with NiuTrans.NMT

4bbd6a27 · hello · 4a3a47f1 · 4bbd6a27 · 4bbd6a27 · 4bbd6a27
Commit 4bbd6a27 authored Feb 28, 2021 by hello
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@ vc140.pdb
 NiuTrans.Tensor.vcxproj.user
 NiuTrans.Tensor.aps
 data/
+build/
+xxx/
+bin/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-# cmake minimum version
+# CMake minimum version
 cmake_minimum_required(VERSION 2.8)

 # Project's name
 project(NiuTensor)

-# The name of the generated executable file
-# The name of the dynamic link library
-set(NIUTENSOR_EXE "NiuTensor")
-set(NIUTENSOR_DLL "${NIUTENSOR_EXE}")
+# The base name shared by the generated executable and the dynamic link library
+set(NIUTRANS_EXE "NiuTensor")
+set(NIUTRANS_DLL "${NIUTRANS_EXE}")
+

 # Generated file path
 set(EXECUTABLE_OUTPUT_PATH ../bin)
 set(LIBRARY_OUTPUT_PATH ../lib)

-# Use CMAKE_MACOSX_RPATH for macOS 
+# Use CMAKE_MACOSX_RPATH for macOS
 set(CMAKE_MACOSX_RPATH 1)

 # Open floder manage
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 option(USE_CUDA "Use CUDA" OFF)
-option(USE_HALF_PRECISION "Use Half Precision in CUDA Codes" OFF)
 option(USE_MKL "Use MKL" OFF)
 option(USE_OPENBLAS "Use OpenBLAS" OFF)
+option(USE_FP16 "Use FP16" OFF)
 option(GEN_DLL "Generate Dynamic Link Library" OFF)

-# If set USE_CUDA ON, please modify CUDA_ROOT below.
-# If set USE_MKL ON, please modify the INTEL_ROOT below.
-# If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
 if (USE_CUDA)
+    # Fall back to a per-platform default only when the caller has not
+    # supplied CUDA_TOOLKIT_ROOT_DIR (e.g. -DCUDA_TOOLKIT_ROOT_DIR=<path>).
-    if(NOT EXISTS ${CUDA_ROOT})
+    if(NOT DEFINED CUDA_TOOLKIT_ROOT_DIR)
        if(WIN32)
-            set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
+            set(CUDA_TOOLKIT_ROOT_DIR "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1")
        else()
-            set(CUDA_ROOT "/usr/local/cuda-9.2")
+            set(CUDA_TOOLKIT_ROOT_DIR "/usr/cuda-9.0")
        endif()
    endif()
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
-    message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
+    message(STATUS "CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}")
 endif()
 if(USE_MKL)
    if(NOT DEFINED INTEL_ROOT)
+        # Platform default; override with -DINTEL_ROOT=<path>.
        if(WIN32)
            set(INTEL_ROOT "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows")
        else()
-            set(INTEL_ROOT "/opt/intel/compilers_and_libraries_2020.2.254/linux")
+            set(INTEL_ROOT "/usr/intel/compilers_and_libraries_2020.2.254/linux")
        endif()
    endif()
    message(STATUS "INTEL_ROOT: ${INTEL_ROOT}")
@@ -52,9 +49,9 @@ endif()
 if(USE_OPENBLAS)
    if(NOT DEFINED OPENBLAS_ROOT)
+        # Platform default; override with -DOPENBLAS_ROOT=<path>.
+        # The Windows default stays generic rather than pointing at a
+        # directory that exists only on one developer's machine.
        if(WIN32)
            set(OPENBLAS_ROOT "C:/Program Files/OpenBLAS")
        else()
-            set(OPENBLAS_ROOT "/opt/OpenBLAS")
+            set(OPENBLAS_ROOT "/usr/OpenBLAS")
        endif()
    endif()
    message(STATUS "OPENBLAS_ROOT: ${OPENBLAS_ROOT}")
@@ -93,66 +90,57 @@ endfunction(my_add_executable)
 # Set libs and compiler options for CUDA
 if(USE_CUDA)
    add_definitions(-DUSE_CUDA)
-    if(USE_HALF_PRECISION)
+    if(USE_FP16)
        add_definitions(-DHALF_PRECISION)
    endif()
-    find_package(CUDA REQUIRED)
-    if(GPU_ARCH STREQUAL K) # Kepler cards (CUDA 5 until CUDA 10)
-        set(ARCH_FLAGS -arch=compute_30 -code=compute_30,sm_30,sm_35,sm_37)
-    elseif(GPU_ARCH STREQUAL M) # Maxwell cards (CUDA 6 until CUDA 11)
-        set(ARCH_FLAGS -arch=compute_50 -code=compute_50,sm_50,sm_52,sm_53)
-    elseif(GPU_ARCH STREQUAL P) # Pascal (CUDA 8 and later)
-        set(ARCH_FLAGS -arch=compute_60 -code=compute_60,sm_60,sm_61,sm_62)
-    elseif(GPU_ARCH STREQUAL V) # Volta (CUDA 9 and later)
-        set(ARCH_FLAGS -arch=compute_70 -code=compute_70,sm_70,sm_72)
-    elseif(GPU_ARCH STREQUAL T) # Turing (CUDA 10 and later)
-        set(ARCH_FLAGS -arch=compute_75 -code=sm_75)
-    elseif(GPU_ARCH STREQUAL A) # Ampere (CUDA 11 and later)
-        set(ARCH_FLAGS -arch=compute_80 -code=sm_80)
-    endif()
-
-    if(USE_HALF_PRECISION)
-        if(NOT DEFINED GPU_ARCH)
-            set(ARCH_FLAGS -arch=sm_60
-                -gencode=arch=compute_60,code=sm_60
-                -gencode=arch=compute_61,code=sm_61
-                -gencode=arch=compute_62,code=sm_62
-                -gencode=arch=compute_70,code=sm_70
-                -gencode=arch=compute_72,code=sm_72
-                -gencode=arch=compute_70,code=compute_70
-            )
-        elseif(${GPU_ARCH} STREQUAL K OR ${GPU_ARCH} STREQUAL M)
-            message(FATAL_ERROR "your GPU cannot use the function half precision")
-        endif()
-    endif()
-    
+    find_package(CUDA ${CUDA_VERSION} REQUIRED)
    if(WIN32)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
+        # The targets are built through find_package(CUDA) / cuda_add_library,
+        # which take their NVCC options from CUDA_NVCC_FLAGS. CMAKE_CUDA_FLAGS
+        # is only consulted when the CUDA language is enabled, which this
+        # project does not do, so the flags must stay in CUDA_NVCC_FLAGS.
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -use_fast_math -DUSE_CUDA")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_30
+            -gencode=arch=compute_30,code=sm_30
+            -gencode=arch=compute_50,code=sm_50
+            -gencode=arch=compute_52,code=sm_52
+            -gencode=arch=compute_60,code=sm_60
+            -gencode=arch=compute_61,code=sm_61
+            -gencode=arch=compute_62,code=sm_62
+            -gencode=arch=compute_70,code=sm_70
+            -gencode=arch=compute_70,code=compute_70
+        )
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
-        link_directories("${CUDA_ROOT}/lib/x64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
+        link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64")
+        include_directories("${CUDA_TOOLKIT_ROOT_DIR}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
-        if(CUDA_VERSION_MAJOR EQUAL 11)
-            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
-        endif()
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}npps.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}nppc.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cudadevrt.lib")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}curand.lib")
    else()
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
-        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
-        link_directories("${CUDA_ROOT}/lib64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
-        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
-        if(CUDA_VERSION_MAJOR EQUAL 11)
-            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
+        if(USE_FP16)
+            set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -DHALF_PRECISION -Wno-deprecated-gpu-targets -std=c++11 ")
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
+                -gencode=arch=compute_60,code=sm_60 
+                -gencode=arch=compute_61,code=sm_61 
+                -gencode=arch=compute_62,code=sm_62 
+                -gencode=arch=compute_70,code=sm_70 
+                -gencode=arch=compute_70,code=compute_70 
+            )
+        else()
+            set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11 ")
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=sm_60
+                -gencode=arch=compute_60,code=sm_60 
+                -gencode=arch=compute_61,code=sm_61 
+                -gencode=arch=compute_62,code=sm_62 
+                -gencode=arch=compute_70,code=sm_70 
+                -gencode=arch=compute_70,code=compute_70 
+            )
        endif()
+        link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+        include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT_DIR}/lib64/")
+        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libculibos.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnpps_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
@@ -164,12 +152,12 @@ endif()

 # Set libs and compiler options for MKL
 if(USE_MKL)
-    add_definitions(-DUSE_BLAS -DMKL)
+    add_definitions(-DMKL)
    set(COMPILER_DIR "${INTEL_ROOT}/compiler")
    set(MKL_DIR "${INTEL_ROOT}/mkl")
    set(CPU_ARCH intel64)
    if(WIN32)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -DMKL")
        link_directories(${MKL_DIR}/lib/intel64/)
        link_directories(${COMPILER_DIR}/lib/intel64)
        include_directories(${MKL_DIR}/include)
@@ -181,9 +169,9 @@ if(USE_MKL)
        set(MKL_LIB_PATH ${MKL_LIB_PATH} "${COMPILER_LIB_DIR}libiomp5md.lib")
    else()
        if(USE_CUDA)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-reorder -DMKL")
        else()
-            set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC")
+            set(CMAKE_CXX_FLAGS "-std=c++11 -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-reorder -Wno-format -O3 -flto -DNDEBUG -rdynamic -fkeep-inline-functions -fPIC -DMKL")
        endif(USE_CUDA)
        link_directories(${MKL_DIR}/lib/intel64/)
        link_directories(${COMPILER_DIR}/lib/intel64)
@@ -199,9 +187,10 @@ endif()

 # Set libs and compiler options for OpenBLAS
 if(USE_OPENBLAS)
-    add_definitions(-DUSE_BLAS -DOPENBLAS)
+    add_definitions(-DUSE_BLAS -DMKL)
    set(OPENBLAS_INCLUDE_DIR "${OPENBLAS_ROOT}/include")
    set(OPENBLAS_LIB_DIR "${OPENBLAS_ROOT}/lib")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_BLAS")
    if(WIN32)
        link_directories(${OPENBLAS_LIB_DIR})
        include_directories(${OPENBLAS_INCLUDE_DIR})
@@ -222,15 +211,15 @@ set(OPENBLAS_LIB ${OPENBLAS_LIB_PATH})
 # Generate dynamic link library about project
 if(USE_CUDA)
    if(GEN_DLL)
-        cuda_add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
+        cuda_add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
    else()
-        my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
+        my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES} ${CU_FILES} ${CUH_FILES})
    endif()
 else()
    if(GEN_DLL)
-        add_library(${NIUTENSOR_DLL} SHARED ${CPP_FILES} ${H_FILES})
+        add_library(${NIUTRANS_DLL} SHARED ${CPP_FILES} ${H_FILES})
    else()
-        my_add_executable(${NIUTENSOR_EXE} ${CPP_FILES} ${H_FILES})
+        my_add_executable(${NIUTRANS_EXE} ${CPP_FILES} ${H_FILES})
    endif()
 endif()

@@ -254,17 +243,17 @@ if(WIN32)

    if(GEN_DLL)
        message(STATUS "Generate Dynamic Link Library")
-        message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
-        target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB})
+        message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
+        target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB})
    else()
        message(STATUS "Generate Makefile For Executable File")
-        message(STATUS "Name of Executable File :" ${NIUTENSOR_EXE})
-        target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB})
+        message(STATUS "Name of Executable File :" ${NIUTRANS_EXE})
+        target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB})
    endif()
    message(STATUS "${MESS}")
 else()
    add_definitions(-std=c++11)
-    set(MESS ${MESS} "On Linux or macOS")
+    set(MESS ${MESS} "On Linux")
    if(USE_CUDA)
        set(MESS ${MESS} " Use CUDA")
        set(ALL_LIB ${ALL_LIB} ${CUDA_LIB})
@@ -285,12 +274,12 @@ else()

    if(GEN_DLL)
        message(STATUS "Generate Dynamic Link Library")
-        message(STATUS "Name of Dynamic Link Library: " ${NIUTENSOR_DLL})
-        target_link_libraries(${NIUTENSOR_DLL} ${ALL_LIB} ${FLAG})
+        message(STATUS "Name of Dynamic Link Library: " ${NIUTRANS_DLL})
+        target_link_libraries(${NIUTRANS_DLL} ${ALL_LIB} ${FLAG})
    else()
        message(STATUS "Generate Makefile For Executable File")
-        message(STATUS "Name of Executable File: " ${NIUTENSOR_EXE})
-        target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
+        message(STATUS "Name of Executable File: " ${NIUTRANS_EXE})
+        target_link_libraries(${NIUTRANS_EXE} ${ALL_LIB} ${FLAG})
    endif()
    message(STATUS "${MESS}")
-endif()
\ No newline at end of file
+endif()
--- a/source/sample/transformer/Decoder.cpp
+++ b/source/sample/transformer/Decoder.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,6 +40,16 @@ AttDecoder::AttDecoder()
    decoderLayerNorm = NULL;
    selfAttCache = NULL;
    enDeAttCache = NULL;
+    history = NULL;
+    preNorm = true;
+    useHistory = false;
+    finalNorm = false;
+    devID = -1;
+    eSize = -1;
+    hSize = -1;
+    nlayer = -1;
+    vSize = -1;
+    dropoutP = 0.0F;
 }

 /* de-constructor */
@@ -53,8 +63,10 @@ AttDecoder::~AttDecoder()
    delete[] fnnLayerNorms;
    delete[] enDeAtt;
    delete[] enDeAttLayerNorms;
-    if (preNorm)
+    if (finalNorm)
        delete decoderLayerNorm;
+    if (useHistory)
+        delete history;
 }

 /*
@@ -70,13 +82,12 @@ void AttDecoder::InitModel(Config& config)
    vSize = config.tgtVocabSize;
    dropoutP = config.dropout;
    preNorm = config.preNorm;
+    finalNorm = config.finalNorm;
+    useHistory = config.useHistory;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");

-    /* embedding model */
-    embedder.InitModel(config, false);
-
    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    selfAttLayerNorms = new LN[nlayer];
@@ -86,10 +97,15 @@ void AttDecoder::InitModel(Config& config)

    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];
-    if (preNorm)
+
+    if (finalNorm)
        decoderLayerNorm = new LN;

+    if (useHistory)
+        history = new LayerHistory;
+
    /* initialize the stacked layers */
+    embedder.InitModel(config, false);
    for (int i = 0; i < nlayer; i++) {
        selfAtt[i].InitModel(config);
        fnns[i].InitModel(config);
@@ -100,8 +116,10 @@ void AttDecoder::InitModel(Config& config)
        selfAttCache[i].enable = true;
        enDeAttCache[i].enable = true;
    }
-    if (preNorm)
+    if (finalNorm)
        decoderLayerNorm->InitModel(config);
+    if (useHistory)
+        history->InitModel(config);
 }

 /*
@@ -117,15 +135,26 @@ make the decoding network
 XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                         XTensor* maskEncDec, int nstep, bool isTraining)
 {
+    /* clear the history */
+    if (useHistory)
+        history->ClearHistory();
+
    XTensor x;

    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
    if (isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, /*inplace=*/true);
+
+    if (useHistory)
+        history->Add(x);

    for (int i = 0; i < nlayer; i++) {
+
+        if (useHistory)
+            x = history->Pop();
+
        XTensor att;
        XTensor ende;
        XTensor fnn;
@@ -146,10 +175,10 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,

        /* dropout */
        if (isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, /*inplace=*/true);

        /* residual connection */
-        res = Sum(att, x);
+        res = Sum(att, x, /*inplace=*/true);

        /* layer normalization with post-norm for self-attention */
        selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);
@@ -163,10 +192,10 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,

        /* dropout */
        if (isTraining && dropoutP > 0)
-            ende = Dropout(ende, dropoutP);
+            ende = Dropout(ende, dropoutP, /*inplace=*/true);

        /* residual connection */
-        res = Sum(ende, selfAttnAfter);
+        res = Sum(ende, selfAttnAfter, /*inplace=*/true);

        /* layer normalization with post-norm for encoder-decoder attention */
        endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);
@@ -179,94 +208,27 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,

        /* dropout */
        if (isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, /*inplace=*/true);

        /* residual connection */
-        res = Sum(fnn, endeAttnAfter);
+        res = Sum(fnn, endeAttnAfter, /*inplace=*/true);

        /* layer normalization with post-norm for fnn */
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
-    }
-
-    if (preNorm)
-        return decoderLayerNorm->Make(x);
-
-    return x;
-}
-
-/*
-make the decoding network
->> inputDec - the input tensor of the decoder
->> outputEnc - the output tensor of the encoder
->> mask - mask that indicates which position is valid
->> maskEncDec - mask for the encoder-decoder attention
->> nstep - the current length of the decoder input
->> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the decoder
-*/
-XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
-                             XTensor* maskEncDec, int nstep, bool isTraining)
-{
-    XTensor x;
-
-    x = embedder.Make(inputDec, true, isTraining, nstep);

-    /* dropout */
-    if (isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
-
-    for (int i = 0; i < nlayer; i++) {
-        XTensor res;
-
-        res = x;
-
-        /* layer normalization with pre-norm for self-attn */
-        x = selfAttLayerNorms[i].Make(x);
-
-        /******************/
-        /* self attention */
-        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
-
-        /* dropout */
-        if (isTraining && dropoutP > 0)
-            x = Dropout(x, dropoutP);
-
-        /* residual connection */
-        x = Sum(res, x);
-
-        res = x;
-
-        /* layer normalization with pre-norm for encoder-decoder attention */
-        x = enDeAttLayerNorms[i].Make(x);
-
-        /* encoder-decoder attention */
-        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
-                            isTraining, &enDeAttCache[i], EN_DE_ATT);
-
-        /* dropout */
-        if (isTraining && dropoutP > 0)
-            x = Dropout(x, dropoutP);
-
-        /* residual connection */
-        x = Sum(res, x);
-
-        res = x;
-
-        /* layer normalization with pre-norm for fnn */
-        x = fnnLayerNorms[i].Make(x);
-
-        /* fnn */
-        x = fnns[i].Make(x, isTraining);
+        if (useHistory)
+            history->Add(x);
+    }

-        /* dropout */
-        if (isTraining && dropoutP > 0)
-            x = Dropout(x, dropoutP);
+    if (useHistory)
+        x = history->Pop();

-        /* residual connection */
-        x = Sum(res, x);
-    }
+    /* clear the history while not training */
+    if (useHistory && !isTraining)
+        history->ClearHistory();

-    x = decoderLayerNorm->Make(x);
+    if (finalNorm)
+        return decoderLayerNorm->Make(x);

    return x;
 }

--- a/source/sample/transformer/Decoder.h
+++ b/source/sample/transformer/Decoder.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -74,6 +74,9 @@ public:
    /* layer normalization for encoder-decoder attention */
    LN* enDeAttLayerNorms;

+    /* dynamic layer history */
+    LayerHistory* history;
+
    /* layer cache list */
    Cache* selfAttCache;

@@ -83,6 +86,12 @@ public:
    /* the location of layer normalization */
    bool preNorm;

+    /* add LN to the decoder output or not */
+    bool finalNorm;
+
+    /* reserve history for layers or not */
+    bool useHistory;
+
 public:
    /* constructor */
    AttDecoder();
@@ -96,10 +105,6 @@ public:
    /* make the decoding network */
    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                 XTensor* maskEncDec, int nstep, bool isTraining);
-
-    /* make the decoding network (pre norm) */
-    XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
-                     XTensor* maskEncDec, int nstep, bool isTraining);
 };

 }

--- a/source/sample/transformer/Encoder.cpp
+++ b/source/sample/transformer/Encoder.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,11 +31,22 @@ namespace nmt
 /* constructor */
 AttEncoder::AttEncoder()
 {
+    devID = -1;
    selfAtt = NULL;
    fnns = NULL;
    attLayerNorms = NULL;
    fnnLayerNorms = NULL;
    encoderLayerNorm = NULL;
+    useHistory = false;
+    history = NULL;
+    dropoutP = 0.0;
+    eSize = -1;
+    finalNorm = false;
+    hSize = -1;
+    ignored = -1;
+    nlayer = -1;
+    preNorm = false;
+    vSize = -1;
 }

 /* de-constructor */
@@ -45,8 +56,10 @@ AttEncoder::~AttEncoder()
    delete[] fnns;
    delete[] attLayerNorms;
    delete[] fnnLayerNorms;
-    if (preNorm)
+    if (finalNorm)
        delete encoderLayerNorm;
+    if (useHistory)
+        delete history;
 }

 /*
@@ -62,31 +75,36 @@ void AttEncoder::InitModel(Config& config)
    hSize = config.modelSize;
    vSize = config.srcVocabSize;
    preNorm = config.preNorm;
+    finalNorm = config.finalNorm;
+    useHistory = config.useHistory;
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

-    /* embedding model */
-    embedder.InitModel(config);
-
    selfAtt = new Attention[nlayer];
    fnns = new FNN[nlayer];
    attLayerNorms = new LN[nlayer];
    fnnLayerNorms = new LN[nlayer];

-    if (preNorm)
+    if (finalNorm)
        encoderLayerNorm = new LN;

+    if (useHistory)
+        history = new LayerHistory;
+
    /* initialize the stacked layers */
+    embedder.InitModel(config);
    for (int i = 0; i < nlayer; i++) {
        selfAtt[i].InitModel(config);
        fnns[i].InitModel(config);
        attLayerNorms[i].InitModel(config);
        fnnLayerNorms[i].InitModel(config);
    }
-    if (preNorm)
+    if (finalNorm)
        encoderLayerNorm->InitModel(config);
+    if (useHistory)
+        history->InitModel(config);
 }

 /*
@@ -99,15 +117,25 @@ make the encoding network
 */
 XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
 {
-    XTensor x;
+    /* clear the history */
+    if (useHistory)
+        history->ClearHistory();

+    XTensor x;
    x = embedder.Make(input, false, isTraining);

    /* dropout */
    if (isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, /*inplace=*/true);
+
+    if (useHistory)
+        history->Add(x);

    for (int i = 0; i < nlayer; i++) {
+
+        if (useHistory)
+            x = history->Pop();
+
        XTensor att;
        XTensor fnn;
        XTensor res;
@@ -123,10 +151,10 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo

        /* dropout */
        if (isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, /*inplace=*/true);

        /* residual connection */
-        res = Sum(att, x);
+        res = Sum(att, x, /*inplace=*/true);

        /* layer normalization with post-norm for self-attn */
        attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);
@@ -139,72 +167,27 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo

        /* dropout */
        if (isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, /*inplace=*/true);

        /* residual connection */
-        res = Sum(fnn, attnAfter);
+        res = Sum(fnn, attnAfter, /*inplace=*/true);

        /* layer normalization with post-norm for fnn */
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
-    }
-    if (preNorm)
-        return encoderLayerNorm->Make(x);
-
-    return x;
-}
-
-/*
-make the encoding network
->> input - the input tensor of the encoder
->> mask - the mask that indicate each position is valid
->> maskEncDec - no use
->> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the encoder
-*/
-XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
-{
-    XTensor x;
-
-    x = embedder.Make(input, false, isTraining);
-
-    /* dropout */
-    if (isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
-
-    for (int i = 0; i < nlayer; i++) {
-        XTensor res;
-
-        res = x;
-
-        /* layer normalization with pre-norm for self-attn */
-        x = attLayerNorms[i].Make(x);
-
-        /* self attention */
-        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
-
-        /* dropout */
-        if (isTraining && dropoutP > 0)
-            x = Dropout(x, dropoutP);
-
-        /* residual connection */
-        x = Sum(res, x);

-        res = x;
+        if (useHistory)
+            history->Add(x);
+    }

-        /* layer normalization with pre-norm for fnn */
-        x = fnnLayerNorms[i].Make(x);
+    if (useHistory)
+        x = history->Pop();

-        /* fnn */
-        x = fnns[i].Make(x, isTraining);
+    /* clear the history while not training */
+    if (useHistory && !isTraining)
+        history->ClearHistory();

-        /* dropout */
-        if (isTraining && dropoutP > 0)
-            x = Dropout(x, dropoutP);
-
-        /* residual connection */
-        x = Sum(res, x);
-    }
-    x = encoderLayerNorm->Make(x);
+    if (finalNorm)
+        return encoderLayerNorm->Make(x);

    return x;
 }

--- a/source/sample/transformer/Encoder.h
+++ b/source/sample/transformer/Encoder.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,6 +27,7 @@
 #include "submodel/Attention.h"
 #include "submodel/Embedding.h"
 #include "submodel/LayerNorm.h"
+#include "submodel/LayerHistory.h"
 #include "../../network/XNet.h"

 using namespace nts;
@@ -89,9 +90,18 @@ public:
    /* layer normalization for encoder */
    LN* encoderLayerNorm;

+    /* dynamic layer history */
+    LayerHistory* history;
+
    /* the location of layer normalization */
    bool preNorm;

+    /* add LN to the encoder output or not */
+    bool finalNorm;
+
+    /* reserve history for layers or not */
+    bool useHistory;
+
 public:
    /* constructor */
    AttEncoder();
@@ -105,9 +115,6 @@ public:
    /* make the encoding network */
    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);

-    /* make the encoding network */
-    XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
-
    /* make the encoding network (wrapper) */
    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
 };

--- a/source/sample/transformer/Model.cpp
+++ b/source/sample/transformer/Model.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -72,20 +72,23 @@ void Model::InitModel(Config& config)
        &config.tgtVocabSize, &config.nhead,
        &config.maxRP, &config.shareAllEmbeddings,
        &config.shareDecInputOutputWeight,
-        &config.maxPosLen
+        &config.maxPosition
    };

    FILE* modelFile = NULL;

    /* read model configurations */
-    if (!config.isTraining) {
-        modelFile = fopen(config.modelFN, "rb");
+    if (!config.isTraining || strcmp(config.pretrainedModel, "") != 0) {
+        if (strcmp(config.pretrainedModel, "") != 0)
+            modelFile = fopen(config.pretrainedModel, "rb");
+        else
+            modelFile = fopen(config.modelFN, "rb");
        CheckNTErrors(modelFile, "Failed to open the model file");
        for (auto& meta : metaInfo) {
            fread(meta, sizeof(int), 1, modelFile);
        }
    }
-    else {
+    if (config.isTraining) {
        /* read the source and target vocab size */
        FILE* trainF = fopen(config.trainFN, "rb");
        CheckNTErrors(trainF, "Failed to open the training file");
@@ -110,9 +113,10 @@ void Model::InitModel(Config& config)
        decoder->InitModel(config);

    /* load parameters */
-    if (!config.isTraining)
+    if (!config.isTraining || strcmp(config.pretrainedModel, "") != 0)
        Read(modelFile);
-    else {
+
+    if (config.isTraining) {
        TensorList params;
        GetParams(params);
        for (int i = 0; i < params.Size(); i++)
@@ -220,6 +224,8 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
    XTensor maskDec;
    XTensor maskEncDec;

+    bool debug(false);
+
    /* encoder mask */
    MakeMTMaskEnc(paddingEnc, maskEnc);

@@ -228,9 +234,25 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,

    encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);

+    if (debug) {
+        LOG("after encoding:");
+        encoding.mem->ShowMemUsage(stderr);
+    }
+    
    decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);

+    if (debug) {
+        LOG("after decoding:");
+        encoding.mem->ShowMemUsage(stderr);
+    }
+
    outputLayer->Make(decoding, output, true, true);
+
+    if (debug) {
+        LOG("after outputing:");
+        encoding.mem->ShowMemUsage(stderr);
+        exit(0);
+    }
 }

 /*
@@ -265,9 +287,9 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);

-    XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
-        paddingEnc.dataType, paddingEnc.devID);
-    XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
+    XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1,
+        paddingEnc.dataType, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    XTensor* maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);

    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
@@ -283,14 +305,14 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);

-    XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-        paddingEnc.devID);
+    XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, 1.0F,
+        paddingEnc.devID, paddingEnc.mem);

    for (int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;

-    XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
+    XTensor* padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, 1.0F, paddingEnc.devID, paddingEnc.mem);

    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -322,6 +344,7 @@ void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)

    /* mask of the padding */
    Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
+
    Unsqueeze(padding2, maskEnc, 0, nhead);
    ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
 }
@@ -355,6 +378,7 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,

    Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
    ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
+
    Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);

    delete[] dims;
@@ -369,6 +393,14 @@ void Model::GetParams(TensorList& list)
    list.Clear();

    /* encoder parameters */
+    if (encoder->useHistory) {
+        for (int i = 0; i < encoder->nlayer + 1; i++)
+            list.Add(&encoder->history->weights[i]);
+        for (int i = 0; i < encoder->nlayer; i++) {
+            list.Add(&encoder->history->layerNorms[i].weight);
+            list.Add(&encoder->history->layerNorms[i].bias);
+        }
+    }
    for (int i = 0; i < encoder->nlayer; i++) {
        list.Add(&encoder->selfAtt[i].weightQ);
        list.Add(&encoder->selfAtt[i].weightK);
@@ -384,18 +416,27 @@ void Model::GetParams(TensorList& list)
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
        list.Add(&encoder->fnns[i].b2);
-        list.Add(&encoder->attLayerNorms[i].w);
-        list.Add(&encoder->attLayerNorms[i].b);
-        list.Add(&encoder->fnnLayerNorms[i].w);
-        list.Add(&encoder->fnnLayerNorms[i].b);
+        list.Add(&encoder->attLayerNorms[i].weight);
+        list.Add(&encoder->attLayerNorms[i].bias);
+        list.Add(&encoder->fnnLayerNorms[i].weight);
+        list.Add(&encoder->fnnLayerNorms[i].bias);
    }
-    if (encoder->preNorm) {
-        list.Add(&encoder->encoderLayerNorm->w);
-        list.Add(&encoder->encoderLayerNorm->b);
+    if (encoder->finalNorm) {
+        list.Add(&encoder->encoderLayerNorm->weight);
+        list.Add(&encoder->encoderLayerNorm->bias);
    }

    if (isMT) {
        /* decoder parameters */
+        if (decoder->useHistory) {
+            for (int i = 0; i < decoder->nlayer + 1; i++)
+                list.Add(&decoder->history->weights[i]);
+            for (int i = 0; i < decoder->nlayer; i++) {
+                list.Add(&decoder->history->layerNorms[i].weight);
+                list.Add(&decoder->history->layerNorms[i].bias);
+            }
+        }
+
        for (int i = 0; i < decoder->nlayer; i++) {
            list.Add(&decoder->selfAtt[i].weightQ);
            list.Add(&decoder->selfAtt[i].weightK);
@@ -407,8 +448,8 @@ void Model::GetParams(TensorList& list)
                list.Add(&decoder->selfAtt[i].RPEmbK);
            list.Add(&decoder->selfAtt[i].weightO);
            list.Add(&decoder->selfAtt[i].biasO);
-            list.Add(&decoder->selfAttLayerNorms[i].w);
-            list.Add(&decoder->selfAttLayerNorms[i].b);
+            list.Add(&decoder->selfAttLayerNorms[i].weight);
+            list.Add(&decoder->selfAttLayerNorms[i].bias);
            list.Add(&decoder->enDeAtt[i].weightQ);
            list.Add(&decoder->enDeAtt[i].weightK);
            list.Add(&decoder->enDeAtt[i].weightV);
@@ -417,18 +458,18 @@ void Model::GetParams(TensorList& list)
            list.Add(&decoder->enDeAtt[i].biasV);
            list.Add(&decoder->enDeAtt[i].weightO);
            list.Add(&decoder->enDeAtt[i].biasO);
-            list.Add(&decoder->enDeAttLayerNorms[i].w);
-            list.Add(&decoder->enDeAttLayerNorms[i].b);
+            list.Add(&decoder->enDeAttLayerNorms[i].weight);
+            list.Add(&decoder->enDeAttLayerNorms[i].bias);
            list.Add(&decoder->fnns[i].w1);
            list.Add(&decoder->fnns[i].b1);
            list.Add(&decoder->fnns[i].w2);
            list.Add(&decoder->fnns[i].b2);
-            list.Add(&decoder->fnnLayerNorms[i].w);
-            list.Add(&decoder->fnnLayerNorms[i].b);
+            list.Add(&decoder->fnnLayerNorms[i].weight);
+            list.Add(&decoder->fnnLayerNorms[i].bias);
        }
-        if (decoder->preNorm) {
-            list.Add(&decoder->decoderLayerNorm->w);
-            list.Add(&decoder->decoderLayerNorm->b);
+        if (decoder->finalNorm) {
+            list.Add(&decoder->decoderLayerNorm->weight);
+            list.Add(&decoder->decoderLayerNorm->bias);
        }
    }

@@ -490,7 +531,7 @@ void Model::Read(FILE* file)

    TensorList params;
    GetParams(params);
-    LOG("params count: %lu", (unsigned long)params.Size());
+    LOG("params count: %zd", params.Size());
    int size = 0;
    for (int i = 0; i < params.Size(); i++) {
        size += params[i]->unitNum;

--- a/source/sample/transformer/Model.h
+++ b/source/sample/transformer/Model.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/NMT.cpp
+++ b/source/sample/transformer/NMT.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,22 +36,31 @@ int NMTMain(int argc, const char** argv)
    /* load configurations */
    Config config(argc, argv);

-    srand(1);
+    srand(config.seed);

    /* training */
    if (strcmp(config.trainFN, "") != 0) {
        
        Model model;
        model.InitModel(config);
+
+        TensorList params;
+        model.GetParams(params);
+        int count = 0;
+        for (int i = 0; i < params.count; i++){
+            count += params[i]->unitNum;
+        }
+        LOG("number of parameters: %d", count);
+
        Trainer trainer;
        trainer.Init(config);
        trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
    }

    /* translating */
-    if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
+    else if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
        
-        /* disable grad flow */
+        /* disable gradient flow */
        DISABLE_GRAD;

        Model model;

--- a/source/sample/transformer/NMT.h
+++ b/source/sample/transformer/NMT.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/Utility.cpp
+++ b/source/sample/transformer/Utility.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -63,18 +63,23 @@ Config::Config(int argc, const char** argv)
    LoadParamInt(argsNum, args, "nhead", &nhead, 4);
    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
-    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
+    LoadParamInt(argsNum, args, "maxrp", &maxRP, -1);
    LoadParamInt(argsNum, args, "embsize", &embSize, 512);
    LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
-    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
+    LoadParamInt(argsNum, args, "maxpos", &maxPosition, 1024);
+    LoadParamInt(argsNum, args, "maxsrclen", &maxSrcLen, 1024);
+    LoadParamInt(argsNum, args, "maxtgtlen", &maxTgtLen, 1024);
    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
    LoadParamInt(argsNum, args, "padid", &padID, 1);
    LoadParamInt(argsNum, args, "startid", &startID, 2);
    LoadParamInt(argsNum, args, "endid", &endID, 2);
-    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
+    LoadParamInt(argsNum, args, "unkid", &unkID, 3);
+    LoadParamBool(argsNum, args, "rpr", &useRPR, maxRP > 0);
    LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
+    LoadParamBool(argsNum, args, "finalnorm", &finalNorm, true);
+    LoadParamBool(argsNum, args, "dlcl", &useHistory, false);

    // TODO: refactor the parameters type to support weight sharing during training
    LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
@@ -86,9 +91,12 @@ Config::Config(int argc, const char** argv)
    /* options for training */
    LoadParamString(argsNum, args, "train", trainFN, "");
    LoadParamString(argsNum, args, "valid", validFN, "");
+    LoadParamString(argsNum, args, "pretrain", pretrainedModel, "");
    LoadParamInt(argsNum, args, "dev", &devID, 0);
+    LoadParamInt(argsNum, args, "seed", &seed, 1);
+    LoadParamInt(argsNum, args, "log", &logInterval, 100);
    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
-    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
+    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 16);
    isTraining = (strcmp(trainFN, "") == 0) ? false : true;
    LoadParamBool(argsNum, args, "mt", &isMT, true);
    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
@@ -117,7 +125,7 @@ Config::Config(int argc, const char** argv)
    LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
    LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
-    LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
+    LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 1);

    /* options for translating */
    LoadParamString(argsNum, args, "test", testFN, "");
@@ -241,8 +249,6 @@ void ShowParams(int argc, char** argv)
    fprintf(stderr, "\n");
 }

-#define MAX_WORD_NUM 120
-
 /*
 split string by delimiter, this will return indices of all sub-strings
 >> s - the original string

--- a/source/sample/transformer/Utility.h
+++ b/source/sample/transformer/Utility.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -73,9 +73,18 @@ public:
    /* path to the validation file */
    char validFN[1024];

+    /* path to the pre-trained model */
+    char pretrainedModel[1024];
+
    /* device id */
    int devID;

+    /* random seed */
+    int seed;
+
+    /* interval step for logging */
+    int logInterval;
+
    /* beam size */
    int beamSize;

@@ -104,7 +113,13 @@ public:
    int modelSize;

    /* the maximum length in positional embedding */
-    int maxPosLen;
+    int maxPosition;
+
+    /* the maximum length for the source sequence */
+    int maxSrcLen;
+
+    /* the maximum length for the target sequence */
+    int maxTgtLen;

    /* the dimension of fnn hidden layer */
    int fnnHiddenSize;
@@ -118,6 +133,9 @@ public:
    /* the padding id */
    int padID;

+    /* the unk id */
+    int unkID;
+
    /* start symbol */
    int startID;

@@ -127,6 +145,12 @@ public:
    /* indicates whether the model uses pre-norm */
    bool preNorm;

+    /* add LN to the encoder/decoder output or not */
+    bool finalNorm;
+
+    /* reserve history for encoder/decoder layers or not */
+    bool useHistory;
+
    /* indicates whether the model is running for machine translation */
    bool isMT;

@@ -139,10 +163,10 @@ public:
    /* indicates whether the model is running with FP16 data type */
    bool useFP16;

-    /* indicates whether we use the RPR attention */
+    /* use the RPR attention or not */
    bool useRPR;

-    /* indicates whether we train the model */
+    /* train the model or not */
    bool isTraining;

    /* dropout rate for the model */

--- a/source/sample/transformer/submodel/Attention.cpp
+++ b/source/sample/transformer/submodel/Attention.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,10 +29,14 @@ namespace nmt
 /* constructor */
 Attention::Attention()
 {
+    /* give every scalar member a defined placeholder value;
+       presumably InitModel overwrites them from the config - TODO confirm */
+    devID = -1;
    nhead = -1;
    dk = -1;
    dv = -1;
    d = -1;
+    dropoutP = 0.0;
+    maxRP = -1;
+    useRPR = false;
 }

 /* de-constructor */
@@ -82,17 +86,17 @@ void Attention::InitModel(Config& config)
    biasQ.SetZeroAll();
    biasO.SetZeroAll();

-    biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
-    biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
+    /* NOTE(review): the arguments are still the (lower, upper) bounds that were
+       passed to SetDataRand. Embedder::InitModel calls w.SetDataRandn(0, v), which
+       suggests SetDataRandn takes (mean, standard deviation); if so, these biases
+       are drawn around a negative mean of -sqrt(6/d) - confirm this is intended */
+    biasK.SetDataRandn(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
+    biasV.SetDataRandn(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
 }

 /*
 make the network
->> k - keys, B * L * H for encoders, B * 1 * H for decoders
+>> k - keys, B * L * H 
       where B = batch size, L = sequence length,
       and H = vector size of each position
->> q - queries, B * L * H
->> v - values, B * L * H for encoders, B * 1 * H for decoders
+>> q - queries, B * L * H for encoders, B * 1 * H for decoders during inference
+>> v - values, B * L * H 
 >> mask - as it is
 >> isTraining - indicates whether the model is used for training
 >> cache - decoder cache
@@ -100,7 +104,7 @@ make the network
 << return - multi-attention result
 */
 XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
-    bool isTraining, Cache* cache, int attType)
+                        bool isTraining, Cache* cache, int attType)
 {
    const bool isEnc = (!cache) ? true : false;

@@ -159,7 +163,7 @@ make the attention network given keys, queries and values (after linear transfor
 >> isTraining - indicates whether the model is used for training
 */
 XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
-    XTensor* mask, bool isTraining)
+                                 XTensor* mask, bool isTraining)
 {
    XTensor kheads;
    XTensor qheads;
@@ -188,9 +192,9 @@ XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

    if (mask)
-        dot = dot + *mask;
+        dot = Sum(dot, *mask, /*inplace=*/true);

-    dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
+    dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead), 0.0F, true);

    scalar = Softmax(dot, -1);

@@ -244,7 +248,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
    XTensor embMatrix, relativeKey;

    /* generate the relative emb index (L_q, L_kv) */
-    embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
+    embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining, isTraining);

    /* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
    relativeKey = Gather(RPEmbK, embMatrix);
@@ -255,13 +259,13 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
        relativeKey = ConvertDataType(relativeKey, X_FLOAT);
    }

-    float scaling = (float)sqrt(d / nhead);
+    float scaling = float(sqrt(d / nhead));
    qheads = ScaleAndShift(qheads, 1.0F / scaling);

    dot = RPDotProduct(qheads, kheads, relativeKey, true);

    if (mask)
-        dot = dot + *mask;
+        dot = Sum(dot, *mask, /*inplace=*/true);

    /* softmax */
    scalar = Softmax(dot, -1);
@@ -287,12 +291,14 @@ generate relative position embeddings
 >> lenQ - the length of query
 >> lenKV - the length of key and value
 >> maxRelativeLen - the maximum length of relative position
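+>> isEnc - indicates whether it is in the encoder
+>> isTraining - indicates whether the model is used for training;
+                if so, a copy of the index matrix (made with _CopyValues) is returned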
 */
-XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
-    const int maxRelativeLen, const bool isEnc)
+XTensor Attention::GetRPEmbedding(int lenQ, int lenKV, 
+        int maxRelativeLen, bool isEnc, bool isTraining)
 {
    XTensor range;
    XTensor embMatrix;
+
    InitTensor1D(&range, lenKV, X_INT, devID);
    int* index = new int[lenKV];

@@ -313,11 +319,19 @@ XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
        embMatrix = Unsqueeze(range, 0, lenQ);
    }

-    //ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
-    embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
-    embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
+    ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
+    ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen));

    delete[] index;
+
+    /* disable gradient flow */
+    if (isTraining) {
+        XTensor copyEmbMatrix;
+        InitTensor(&copyEmbMatrix, &embMatrix);
+        _CopyValues(&embMatrix, &copyEmbMatrix);
+        return copyEmbMatrix;
+    }
+
    return embMatrix;
 }

@@ -351,6 +365,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i

    XTensor context;
    context = BMMul(x, y);
+
    int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
    context = Reshape(context, 4, newDims);

@@ -358,7 +373,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i
    xTrans = Transpose(x, 0, 1);

    XTensor relative;
-    relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
+    relative = BMMul(xTrans, X_NOTRANS, z, transposeFlag);

    XTensor relativeTrans;
    relativeTrans = Transpose(relative, 0, 1);
@@ -367,7 +382,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i

    relativeTrans = Reshape(relativeTrans, 4, splitDims);

-    return context + relativeTrans;
+    return Sum(context, relativeTrans);
 }

 /* constructor */

--- a/source/sample/transformer/submodel/Attention.h
+++ b/source/sample/transformer/submodel/Attention.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -93,10 +93,6 @@ public:
    /* bias for V */
    XTensor biasV;

-    XTensor wBig;
-
-    XTensor bBig;
-
    /* RPR emb */
    XTensor RPEmbK;

@@ -148,7 +144,7 @@ public:
                             XTensor* mask, bool isTraining, bool isEnc);

    /* generate relative position embeddings */
-    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
+    XTensor GetRPEmbedding(int lenQ, int lenKV, int maxRelativeLen, bool isEnc, bool isTraining);

    /* relative position-aware dot-product attention inner calculation */
    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);

--- a/source/sample/transformer/submodel/CommonModules.cpp
+++ b/source/sample/transformer/submodel/CommonModules.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/CommonModules.h
+++ b/source/sample/transformer/submodel/CommonModules.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/Embedding.cpp
+++ b/source/sample/transformer/submodel/Embedding.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,8 +29,10 @@ namespace nmt
 /* constructor */
 Embedder::Embedder()
 {
+    /* -1 marks "not configured yet"; InitModel assigns d, eSize,
+       maxLength and vSize from the config */
+    d = -1;
    devID = -1;
    vSize = -1;
+    eSize = -1;
    maxLength = -1;
 }

@@ -50,7 +52,7 @@ void Embedder::InitModel(Config& config, bool isEnc)
    d = config.modelSize;
    padIdx = config.padID;
    eSize = config.embSize;
-    maxLength = config.maxPosLen;
+    maxLength = config.maxPosition;
    vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;

    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
@@ -59,6 +61,10 @@ void Embedder::InitModel(Config& config, bool isEnc)
    DTYPE v = 1.0F / (float)sqrt((float)eSize);
    w.SetDataRandn(0, v);

+    for (int i = 0; i < eSize; i++) {
+        w.Set2D(0.0F, padIdx, i);
+    }
+
    /* create the positional embedding matrix */
    MakePosEmbedding(maxLength);
 }
@@ -138,13 +144,13 @@ XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
    posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));

    /* then we make word embeddings */
-    //w.enableGrad = false;
    wordEmbedding = Gather(w, input);

-    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
+    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize), 0.0F, true);

    /* we sum over the two embeddings */
    SumMe(wordEmbedding, posEmbedding);
+
    return wordEmbedding;
 }


--- a/source/sample/transformer/submodel/Embedding.h
+++ b/source/sample/transformer/submodel/Embedding.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/FNN.cpp
+++ b/source/sample/transformer/submodel/FNN.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,9 +31,11 @@ namespace nmt
 /* constructor */
 FNN::FNN()
 {
+    /* a dropout rate of 0 makes Make() skip the dropout step (it tests dropoutP > 0);
+       the -1 sizes are placeholders, presumably replaced by InitModel - TODO confirm */
+    dropoutP = 0.0F;
    inSize = -1;
    outSize = -1;
    hSize = -1;
+    devID = -1;
 }

 /* de-constructor */
@@ -66,8 +68,8 @@ void FNN::InitModel(Config& config)
    _SetDataFanInOut(&w1, scale);
    _SetDataFanInOut(&w2, scale);

-    w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
-    w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
+    //w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
+    //w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));

    b1.SetZeroAll();
    b2.SetZeroAll();
@@ -85,9 +87,9 @@ XTensor FNN::Make(XTensor& input, bool isTraining)

    /* t1 = max(0, x * w1 + b1) */
    t1 = Rectify(MulAndShift(input, w1, b1));
-
+    
    if (isTraining && dropoutP > 0)
-        t1 = Dropout(t1, dropoutP);
+        t1 = Dropout(t1, dropoutP, /*inplace=*/true);

    /* result = t1 * w2 + b2 */
    return MulAndShift(t1, w2, b2);

--- a/source/sample/transformer/submodel/FNN.h
+++ b/source/sample/transformer/submodel/FNN.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/GLU.cpp
+++ b/source/sample/transformer/submodel/GLU.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,6 +30,7 @@ namespace nmt
 /* constructor */
 GLU::GLU()
 {
+    devID = -1;
    inSize = -1;
    outSize = -1;
    hSize = -1;

--- a/source/sample/transformer/submodel/GLU.h
+++ b/source/sample/transformer/submodel/GLU.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/LayerHistory.cpp
+++ b/source/sample/transformer/submodel/LayerHistory.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,6 +16,7 @@

 /*
 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
+ * $Modified by: Chi Hu (huchinlp@gmail.com) 2020-12-10
 */

 #include "Embedding.h"
@@ -23,6 +24,7 @@
 #include "LayerHistory.h"
 #include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"
+#include "../../../tensor/XName.h"

 #define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
 #define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
@@ -34,16 +36,20 @@ namespace nmt
 LayerHistory::LayerHistory()
 {
+    /* all owned pointers start as NULL, so the destructor's delete calls are
+       no-ops on an object that was never initialized; weights and layerNorms are
+       allocated in InitModel, history only by ClearHistory(true) */
    d = -1;
+    devID = -1;
    count = -1;
-    weight = NULL;
+    nlayer = -1;
+    weights = NULL;
+    history = NULL;
    layerNorms = NULL;
 }

 /* de-constructor */
 LayerHistory::~LayerHistory()
 {
-    history.Clear();
+    /* release the history object and the two arrays created with new[] in InitModel;
+       deleting a NULL pointer is a no-op, so no guards are needed */
+    delete history;
    delete[] layerNorms;
+    delete[] weights;
 }

 /*
@@ -56,7 +62,20 @@ void LayerHistory::InitModel(Config& config)
    d = config.modelSize;
    nlayer = config.nEncLayer;

-    InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
+    /*  the triangle weight matrices for dlcl 
+        layer 0: [1, 0, ..., 0]               
+        layer 1: [0.5, 0.5, ..., 0]           
+        layer 2: [0.33, 0.33, 0.33, ..., 0]   */
+    weights = new XTensor[nlayer + 1];
+    for (int i = 0; i < nlayer + 1; i++) {
+        InitTensor1D(&(weights[i]), i + 1, X_FLOAT, devID);
+        float* data = new float[i + 1];
+        for (int j = 0; j < i + 1; j++) {
+            data[j] = 1.0F / float(i + 1);
+        }
+        weights[i].SetData(data, i + 1);
+        delete[] data;
+    }

    layerNorms = new LN[nlayer];

@@ -68,59 +87,88 @@ void LayerHistory::InitModel(Config& config)

 /*
 the Add operation
->> tensor - the previous layer output. It might be of size B * L * H
-            where B = batch size, L = sequence length,
-            and H = vector size of each position
+>> layer - the previous layer output. It might be of size B * L * H
+           where B = batch size, L = sequence length,
+           and H = vector size of each position
 */
-void LayerHistory::Add(XTensor& tensor)
+void LayerHistory::Add(XTensor& layer)
 {
    /* the embedding is not normed */
    count += 1;
-    if (history.Size() == 0) {
-
-        //sample_ = tensor;
-        history.Add(&tensor);
+    /* NOTE(review): history is NULL until ClearHistory(true) has run, so this
+       dereference presumably relies on callers resetting first - confirm */
+    if (history->count == 0) {
+        history->Add(layer);
        return;
    }
-    XTensor ln = layerNorms[count - 2].Make(tensor);
-    history.Add(&ln);
+    /* after ClearHistory, count is 2 for the first normalized layer, hence the -2 offset;
+       the normalized result overwrites the caller's tensor before it is stored, and
+       History::Add assigns from std::move(layer), so `layer` should not be relied on afterwards */
+    layer = layerNorms[count - 2].Make(layer);
+    history->Add(layer);
 }

 /*
-generate the weight sum vector of all previous layer output in the history as the layer input
+calculate the weighted sum of previous layers
+the result for the i-th layer is:
+result = sum(layers[0...i] * weight[i][0...i])
+shape of the result: B * L * H
 */
 XTensor LayerHistory::Pop()
 {
-    /* the number of layer output in the history */
-    int size = (int)history.Size();
-
-    TensorList historyList;
-    for (int i = 0; i < size; i++)
-        historyList.Add(history[i]);
+    /* gather pointers to every layer output stored so far
+       NOTE(review): with an empty history the weights index below is out of range;
+       presumably callers always Add() before Pop() - confirm */
+    TensorList list;
+    for (int i = 0; i < history->count; i++) {
+        list.Add(&(history->list[i]));
+    }
+    /* merge the layers along dimension 0; the reshape below splits that leading
+       dimension back into (number of layers, original dimension 0) */
+    XTensor stack;
+    stack = Merge(list, 0);

-    /* we need stack the tensor along the first dim*/
-    XTensor stackTensor = Stack(historyList, 0);
+    /* shift the existing dimensions right by one slot; only stack.order entries
+       are valid, so the copy stops before stack.dimSize[stack.order] */
+    int dimSize[MAX_TENSOR_DIM_NUM];
+    for (int i = 0; i < stack.order; i++)
+        dimSize[i + 1] = stack.dimSize[i];
+    dimSize[0] = int(list.Size());
+    dimSize[1] /= dimSize[0];
+    stack = Reshape(stack, stack.order + 1, dimSize);

-    XTensor interWeight;
+    /* weight the layers along dimension 0 with the weight vector
+       that belongs to the current depth */
+    XTensor res;
+    res = MultiplyDim(stack, weights[list.Size() - 1], 0);

-    InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
-    XTensor layerWeight;
-    InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);
+    /* sum over the layer dimension, result is B * L * H */
+    return ReduceSum(res, 0);
+}

-    _SelectRange(&weight, &interWeight, 0, size - 1, size);
-    interWeight.Reshape(interWeight.unitNum);
-    _SelectRange(&interWeight, &layerWeight, 0, 0, size);
-    MultiplyDimMe(stackTensor, layerWeight, 0);
+/*
+drop all stored layers and restart the layer counter
+>> reset - if true, allocate a fresh empty history; otherwise leave it NULL
+*/
+void LayerHistory::ClearHistory(bool reset)
+{
+    /* deleting a NULL pointer is a no-op, so no guard is required */
+    delete history;
+    history = reset ? new History : NULL;
+    count = 0;
+}

-    XTensor result;
-    ReduceSum(stackTensor, result, 0);
+/* initialize the history: it starts with no stored layers */
+History::History()
+    : count(0)
+{
+}

-    return result;
+/* delete the history:
+   release the data, the graph links and the gradient of every slot */
+History::~History()
+{
+    for (int i = 0; i < MAX_LAYER_NUM; i++) {
+        list[i].DestroyData();
+        XLink::ClearOutgoing(&list[i]);
+        XLink::ClearIncoming(&list[i]);
+        if (list[i].grad != NULL) {
+            delete list[i].grad;
+            /* the XTensor members are destroyed right after this body runs;
+               clear the pointer so it cannot be read or freed a second time */
+            list[i].grad = NULL;
+        }
+    }
 }

-void LayerHistory::ClearHistory()
+/*
+append a layer to the history
+>> layer - the tensor to store; its content is handed over with std::move
+*/
+void History::Add(XTensor& layer)
 {
-    history.Clear();
+    /* NOTE(review): there is no check that count < MAX_LAYER_NUM before
+       list[count] is written - confirm callers bound the depth */
+    list[count] = std::move(layer);
+    /* clear the outgoing/incoming links still recorded on the caller's tensor */
+    XLink::ClearOutgoing(&layer);
+    XLink::ClearIncoming(&layer);
+    count++;
 }

 }
\ No newline at end of file
--- a/source/sample/transformer/submodel/LayerHistory.h
+++ b/source/sample/transformer/submodel/LayerHistory.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,16 +27,42 @@
 #include "../../../tensor/function/FHeader.h"

 using namespace nts;
+using namespace std;

 namespace nmt
 {

-/*
-multi-head attention
-y(Q, K, V) = cat(head_1, head_2, ..., head_n)
-where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V)
-      attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V
-      d_k = dimension size of K
+#define MAX_LAYER_NUM 50
+
+/* 
+the class of history list
+*/
+class History {
+public:
+
+    /* number of elements in the list */
+    int count;
+
+    /* the history list, a fixed array of MAX_LAYER_NUM slots */
+    XTensor list[MAX_LAYER_NUM];
+
+public: 
+
+    /* constructor */
+    History();
+
+    /* de-constructor */
+    ~History();
+
+    /* append a layer to the list */
+    void Add(XTensor& layer);
+};
+
+/* 
+the class of layer history
+it generates the weighted sum of previous layers
+the result for the i-th layer is:
+res = sum(layers[0...i] * weight[i][0...i])
 */
 class LayerHistory
 {
@@ -44,8 +70,8 @@ public:
    /* device id */
    int devID;

-    /* the triangle weight matrix for dlcl */
-    XTensor weight;
+    /* the triangle weight matrices for dlcl */
+    XTensor* weights;

    /* hidden size */
    int d;
@@ -57,7 +83,7 @@ public:
    int count;

    /* a history to store the value of intimidate layers */
-    TensorList history;
+    History* history;

    /* layer normalization for each intimidate layer */
    LN* layerNorms;
@@ -79,7 +105,7 @@ public:
    XTensor Pop();

    /* clean the history*/
-    void ClearHistory();
+    void ClearHistory(bool reset=true);
 };

 }

--- a/source/sample/transformer/submodel/LayerNorm.cpp
+++ b/source/sample/transformer/submodel/LayerNorm.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -51,12 +51,12 @@ void LN::InitModel(Config& config)

    d = config.modelSize;

-    InitTensor1D(&w, d, X_FLOAT, devID);
-    InitTensor1D(&b, d, X_FLOAT, devID);
-    w.SetDataRand(1.0F, 1.0F);
-    b.SetZeroAll();
+    InitTensor1D(&weight, d, X_FLOAT, devID);
+    InitTensor1D(&bias, d, X_FLOAT, devID);
+    weight.SetDataRand(1.0F, 1.0F);
+    bias.SetZeroAll();

-    w.SetDataFixed(1);
+    weight.SetDataFixed(1);
 }

 /*
@@ -104,7 +104,11 @@ XTensor LN::Make(XTensor& input)
    }

    /* result = x' * w + b   */
-    return xn * w + b;
+    xn = xn * weight;
+
+    xn = Sum(xn, bias, true);
+
+    return xn;
 }

 }
\ No newline at end of file
--- a/source/sample/transformer/submodel/LayerNorm.h
+++ b/source/sample/transformer/submodel/LayerNorm.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -39,10 +39,10 @@ public:
    int devID;

    /* the transformation matrix w */
-    XTensor w;
+    XTensor weight;

    /* the bias term b */
-    XTensor b;
+    XTensor bias;

    /* dimension size of the model */
    int d;

--- a/source/sample/transformer/submodel/NNUtil.cpp
+++ b/source/sample/transformer/submodel/NNUtil.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/NNUtil.h
+++ b/source/sample/transformer/submodel/NNUtil.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/submodel/Output.cpp
+++ b/source/sample/transformer/submodel/Output.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,6 +33,7 @@ Output::Output()
    devID = -1;
    vSize = -1;
    hSize = -1;
+    padIdx = -1;
 }

 /* de-constructor */
@@ -49,11 +50,15 @@ void Output::InitModel(Config& config)
    devID = config.devID;
    hSize = config.modelSize;
    vSize = config.tgtVocabSize;
+    padIdx = config.padID;

    InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);

    DTYPE v = 1.0F / (float)sqrt((float)hSize);
    w.SetDataRandn(0, v);
+    for (int i = 0; i < hSize; i++) {
+        w.Set2D(0.0F, padIdx, i);
+    }
 }

 /*

--- a/source/sample/transformer/submodel/Output.h
+++ b/source/sample/transformer/submodel/Output.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,6 +43,9 @@ public:
    /* vector size of the linear transformation */
    int hSize;

+    /* the padding index */
+    int padIdx;
+
    /* transformation matrix */
    XTensor w;


--- a/source/sample/transformer/translate/DataSet.cpp
+++ b/source/sample/transformer/translate/DataSet.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,19 +32,19 @@ using namespace nmt;

 namespace nts {

-/* sort the output by id (in ascending order) */
+/*
+sort the input by length (in descending order)
+the length of an example is the number of entries in its `values`
+NOTE(review): presumably `sort` is std::sort, which is not stable, so
+examples of equal length may end up in any relative order - confirm
+*/
 void DataSet::SortInput() {
-    sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, 
-        [](Example* a, Example* b) {
-            return a->values.count > b->values.count;
+    sort(inputBuffer.begin(), inputBuffer.end(), 
+        [](const Example& a, const Example& b) {
+            return a.values.size() > b.values.size();
         });
 }

-/* sort the input by length (in descending order) */
+/*
+sort the output by id (in ascending order)
+NOTE(review): presumably this restores the order of the input lines
+before the results are dumped - confirm against the caller
+*/
 void DataSet::SortOutput() {
-    sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, 
-        [](Result* a, Result* b) {
-            return a->id < b->id;
+    sort(outputBuffer.begin(), outputBuffer.end(),
+        [](const Example& a, const Example& b) {
+            return a.id < b.id;
         });
 }

@@ -54,43 +54,43 @@ load data from the file to the buffer
 void DataSet::LoadDataToBuffer()
 {
    string line;
-    inputBuffer.Clear();
+    inputBuffer.clear();
    bufferUsed = 0;

    int id = 0;
    const string tokenDelimiter = " ";

    while (getline(*fp, line)) {
-        IntList values;
+        vector<int> values;

        /* load words and transform them to ids */
-        auto indices = SplitToPos(line, tokenDelimiter);
+        UInt64List indices = SplitToPos(line, tokenDelimiter);

-        /* reserve the first 120 words if the input is too long */
-        size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
+        /* reserve the first maxSrcLen words if the input is too long */
+        int maxLen = int(indices.Size()) > maxSrcLen ? maxSrcLen : int(indices.Size());

-        for (size_t i = 0; i < maxLen; i++) {
-            size_t offset = (i != (indices.Size() - 1)) ?
-                              (size_t)indices[(int)i + 1] - (size_t)indices[(int)i] - tokenDelimiter.size()
-                            : line.size() - (size_t)indices[(int)i];
-            string word = line.substr((size_t)indices[(int)i], offset);
+        for (int i = 0; i < maxLen; i++) {
+            auto offset = (i != (int(indices.Size()) - 1)) ?
+                indices[i + 1] - indices[i] - tokenDelimiter.size()
+                : line.size() - indices[i];
+            string word = line.substr(indices[i], offset);
            if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
-                values.Add(UNK);
+                values.emplace_back(unkID);
            else
-                values.Add(srcVocab.word2id.at(word));
+                values.emplace_back(srcVocab.word2id.at(word));
        }

        /* make sure that the sequence ends with EOS */
-        if (values.Size() != 0 && values[-1] != EOS)
-            values.Add(EOS);
-
-        Example* example = new Example;
-        example->id = id;
-        example->values = values;
-        if (values.Size() != 0)
-            inputBuffer.Add(example);
+        if (values.size() != 0 && values.back() != endID)
+            values.emplace_back(endID);
+
+        Example example;
+        example.id = id;
+        example.values = values;
+        if (values.size() != 0)
+            inputBuffer.emplace_back(example);
        else
-            emptyLines.Add(id);
+            emptyLines.emplace_back(id);
        id++;
    }
    fp->close();
@@ -109,23 +109,16 @@ load a mini-batch to the device (for translating)
 >> devID - the device id, -1 for the CPU
 << indices of the sentences
 */
-UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
-                              int minSentBatch, int batchSize, int devID)
+UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc, int minSentBatch, int batchSize, int devID)
 {
    int realBatchSize = minSentBatch;

    /* get the maximum sentence length in a mini-batch */
-    int maxLen = (int)inputBuffer[(int)bufferUsed]->values.Size();
-
-    /* dynamic batching for sentences */
-    //while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
-    //    && (realBatchSize * maxLen < batchSize)) {
-    //    realBatchSize++;
-    //}
+    int maxLen = int(inputBuffer[bufferUsed].values.size());

    /* real batch size */
-    if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
-        realBatchSize = (int)(inputBuffer.Size() - bufferUsed);
+    if ((int(inputBuffer.size()) - bufferUsed) < realBatchSize) {
+        realBatchSize = int(inputBuffer.size()) - bufferUsed;
    }

    CheckNTErrors(maxLen != 0, "invalid length");
@@ -134,25 +127,25 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    float* paddingValues = new float[realBatchSize * maxLen];

    for (int i = 0; i < realBatchSize * maxLen; i++) {
-        batchValues[i] = PAD;
+        batchValues[i] = padID;
        paddingValues[i] = 1.0F;
    }

-    size_t curSrc = 0;
+    int curSrc = 0;

    /* right padding */
    UInt64List infos;
-    size_t totalLength = 0;
+    int totalLength = 0;

-    for (size_t i = 0; i < (size_t)realBatchSize; ++i) {
-        infos.Add(inputBuffer[(int)(bufferUsed + i)]->id);
-        totalLength += inputBuffer[(int)(bufferUsed + i)]->values.Size();
+    for (int i = 0; i < realBatchSize; ++i) {
+        infos.Add(inputBuffer[bufferUsed + i].id);
+        totalLength += int(inputBuffer[bufferUsed + i].values.size());

        curSrc = maxLen * i;
-        for (size_t j = 0; j < inputBuffer[(int)(bufferUsed + i)]->values.Size(); j++)
-            batchValues[(int)(curSrc++)] = (int)inputBuffer[(int)(bufferUsed + i)]->values[(int)j];
+        for (int j = 0; j < int(inputBuffer[bufferUsed + i].values.size()); j++)
+            batchValues[curSrc++] = inputBuffer[bufferUsed + i].values[j];
        while (curSrc < maxLen * (i + 1))
-            paddingValues[(int)(curSrc++)] = 0;
+            paddingValues[curSrc++] = 0;
    }
    infos.Add(totalLength);

@@ -201,7 +194,7 @@ void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgt

 /* check if the buffer is empty */
 bool DataSet::IsEmpty() {
-    if (bufferUsed < inputBuffer.Size())
+    /* bufferUsed is a signed int while size() is unsigned; cast the size
+       (as LoadBatch does) to avoid a signed/unsigned comparison */
+    if (bufferUsed < int(inputBuffer.size()))
         return false;
     return true;
 }
@@ -211,12 +204,11 @@ void DataSet::DumpRes(const char* ofn)
 {
    ofstream ofile(ofn, ios::out);

-    for (int t = 0; t < outputBuffer.Size(); t++) {
-        auto res = outputBuffer[t];
-        for (int i = 0; i < res->res.Size(); i++) {
-            if (res->res[i] < 4)
+    for (const auto& tgtSent : outputBuffer) {
+        for (const auto& w : tgtSent.values) {
+            if (w < 4)
                break;
-            ofile << tgtVocab.id2word[res->res[i]] << " ";
+            ofile << tgtVocab.id2word[w] << " ";
        }
        ofile << "\n";
    }
@@ -229,14 +221,6 @@ DataSet::~DataSet()
 {
    /* release the file */
    delete fp;
-
-    /* release the input buffer */
-    for (int i = 0; i < inputBuffer.Size(); i++)
-        delete inputBuffer[i];
-
-    /* release the output buffer */
-    for (int i = 0; i < outputBuffer.Size(); i++)
-        delete outputBuffer[i];
 }

 }
\ No newline at end of file
--- a/source/sample/transformer/translate/DataSet.h
+++ b/source/sample/transformer/translate/DataSet.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,40 +31,41 @@
 #include "../../../tensor/XTensor.h"
 #include "../../../tensor/XGlobal.h"

-#define MAX_WORD_NUM 120
-
 using namespace std;

 namespace nts {
+
 /* the struct of tokenized input */
 struct Example {
-    int id;
-    IntList values;
-};
 
-/* the struct of tokenized output */
-struct Result {
+    /* id of the sentence (LoadDataToBuffer in DataSet.cpp uses the line index) */
     int id;
-    IntList res;
+
+    /* token ids of the sentence */
+    vector<int> values;
+
+public:
+    /* create an empty example whose id is 0 */
+    Example() {
+        id = 0;
+    }
 };

 /* A `DataSet` is associated with a file which contains variable length data.*/
 struct DataSet {
+
 public:
    /* the data buffer */
-    InputBufferType inputBuffer;
+    vector<Example> inputBuffer;

    /* a list of empty line number */
-    IntList emptyLines;
+    vector<int> emptyLines;

    /* the result buffer */
-    OutputBufferType outputBuffer;
+    vector<Example> outputBuffer;

    /* the pointer to file stream */
    ifstream* fp;

    /* size of used data in buffer */
-    size_t bufferUsed;
+    int bufferUsed;

    /* the source vocabulary */
    Vocab srcVocab;
@@ -72,6 +73,21 @@ public:
    /* the target vocabulary */
    Vocab tgtVocab;

+    /* the maximum length of an input sequence */
+    int maxSrcLen;
+
+    /* the padding id */
+    int padID;
+
+    /* the unk id */
+    int unkID;
+
+    /* start symbol */
+    int startID;
+
+    /* end symbol */
+    int endID;
+
 public:

    /* sort the input by length */
@@ -84,8 +100,7 @@ public:
    void LoadDataToBuffer();

    /* generate a mini-batch */
-    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
-                         int sBatch, int wBatch, int devID);
+    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc, int minSentBatch, int batchSize, int devID);

    /* initialization function */
    void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);

--- a/source/sample/transformer/translate/LengthPenalty.cpp
+++ b/source/sample/transformer/translate/LengthPenalty.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)

    base = (length + 5.0F) / (1.0F + 5.0F);

-    lp = pow(base, alpha);
+    lp = float(pow(base, alpha));

    return lp;
 }

--- a/source/sample/transformer/translate/LengthPenalty.h
+++ b/source/sample/transformer/translate/LengthPenalty.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Predictor.cpp
+++ b/source/sample/transformer/translate/Predictor.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Predictor.h
+++ b/source/sample/transformer/translate/Predictor.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Search.cpp
+++ b/source/sample/transformer/translate/Search.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -727,6 +727,7 @@ GreedySearch::GreedySearch()
    endSymbolNum = 0;
    endSymbols = new int[32];
    startSymbol = -1;
+    scalarMaxLength = -1;
 }

 /* de-constructor */

--- a/source/sample/transformer/translate/Search.h
+++ b/source/sample/transformer/translate/Search.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Translator.cpp
+++ b/source/sample/transformer/translate/Translator.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -54,6 +54,12 @@ void Translator::Init(Config& config)
    sentBatch = config.sBatchSize;
    wordBatch = config.wBatchSize;

+    batchLoader.maxSrcLen = config.maxSrcLen;
+    batchLoader.unkID = config.unkID;
+    batchLoader.padID = config.padID;
+    batchLoader.startID = config.startID;
+    batchLoader.endID = config.endID;
+
    if (beamSize > 1) {
        LOG("translating with beam search (%d)", beamSize);
        seacher = new BeamSearch();
@@ -123,12 +129,12 @@ void Translator::Translate(const char* ifn, const char* sfn,
            XTensor score;
            ((BeamSearch*)seacher)->Search(model, batchEnc, paddingEnc, output, score);
        }
-
        for (int i = 0; i < indices.Size() - 1; ++i) {
-            Result* res = new Result;
-            res->id = int(indices[i]);
-            res->res = output[i];
-            batchLoader.outputBuffer.Add(res);
+            Example res;
+            res.id = int(indices[i]);
+            for (int j = 0; j < output[i].Size(); j++)
+                res.values.emplace_back(output[i][j]);
+            batchLoader.outputBuffer.emplace_back(std::move(res));
        }
        delete[] output;

@@ -142,17 +148,17 @@ void Translator::Translate(const char* ifn, const char* sfn,
            double elapsed = GetClockSec() - batchStart;
            batchStart = GetClockSec();
            LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
-                elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()), 
+                elapsed, float(sentCount) / float(batchLoader.inputBuffer.size()), 
                double(wc) / elapsed);
            wc = 0;
        }
    }

    /* append empty lines to the result */
-    for (int i = 0; i < batchLoader.emptyLines.Size(); i++) {
-        Result* emptyRes = new Result;
-        emptyRes->id = batchLoader.emptyLines[i];
-        batchLoader.outputBuffer.Add(emptyRes);
+    for (const auto& empty: batchLoader.emptyLines) {
+        Example emptyRes;
+        emptyRes.id = empty;
+        batchLoader.outputBuffer.emplace_back(emptyRes);
    }

    double startDump = GetClockSec();
@@ -166,7 +172,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
    double elapsed = GetClockSec() - startDump;

    LOG("translation completed (word=%d, sent=%zu)", 
-        wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
+        wordCountTotal, batchLoader.outputBuffer.size() + batchLoader.emptyLines.size());
 }

 /*

--- a/source/sample/transformer/translate/Translator.h
+++ b/source/sample/transformer/translate/Translator.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Vocab.cpp
+++ b/source/sample/transformer/translate/Vocab.cpp
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

--- a/source/sample/transformer/translate/Vocab.h
+++ b/source/sample/transformer/translate/Vocab.h
-/* NiuTrans.NMT - an open-source neural machine translation system.
+/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");