Commit b801df51 by liyinqiao

Update the code of the Transformer sample and the XList class.

1. Update the machine translation sample code; the current version matches NiuTrans.NMT.
2. Update the XList class.
3. Bug fixes.
parent 8178ba40
@@ -128,8 +128,10 @@ int FNNLMMain(int argc, const char ** argv)
     Init(model);

     /* learn model parameters */
-    if(strcmp(trainFN, ""))
+    if(strcmp(trainFN, "")) {
+        ENABLE_GRAD;
         Train(trainFN, shuffled, model);
+    }

     /* save the final model */
     if(strcmp(modelFN, "") && strcmp(trainFN, ""))
...
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TAttention.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TAttention::T2TAttention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
isMasked = false;
ignored = 0;
}
/* de-constructor */
T2TAttention::~T2TAttention()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
*/
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID)
{
devID = myDevID;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID);
InitTensor2D(&wq, d, dk, X_FLOAT, devID);
InitTensor2D(&wv, d, dv, X_FLOAT, devID);
InitTensor2D(&wa, d, d, X_FLOAT, devID);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&wk, scale);
_SetDataFanInOut(&wq, scale);
_SetDataFanInOut(&wv, scale);
_SetDataFanInOut(&wa, scale);
_SetDataFanInOut(&wbig, scale);
}
/*
make the network
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-head attention result
*/
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
/* linear transformation before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the network given a big tensor that keeps keys, queries and values
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-head attention result
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
XTensor kqv2;
TensorList split;
kqv2 = MMul(kqv, wbig);
int d1 = kqv2.GetDim(0);
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID);
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
Split(kqv2, split, 2, 3);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
<< return - multi-head attention result
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MMul(Merge(att, att.order - 1), wa);
}
}
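The formula in the comment above, scalar = softmax(Q * K^T / sqrt(dk)) * V, is the heart of this file. For reference, a minimal dependency-free sketch of single-head scaled dot-product attention over plain row-major arrays; the names (AttentionRef, seqLen, dHead) are illustrative and not part of the library. Note that in the code above the scale divides dk by nhead because dk stores the key dimension summed over all heads.

#include <algorithm>
#include <cmath>
#include <vector>

/* Reference implementation of softmax(Q * K^T / sqrt(d)) * V for one head.
   q, k, v are row-major [seqLen x dHead]; the result has the same shape.
   Mask handling mirrors the code above: masked logits get a large negative
   value added before the softmax. */
std::vector<float> AttentionRef(const std::vector<float>& q,
                                const std::vector<float>& k,
                                const std::vector<float>& v,
                                const std::vector<float>& mask, /* [seqLen x seqLen], 0 or -1e9 */
                                int seqLen, int dHead)
{
    std::vector<float> out(seqLen * dHead, 0.0F);
    std::vector<float> logits(seqLen);
    float scale = 1.0F / std::sqrt((float)dHead);

    for (int i = 0; i < seqLen; i++) {
        /* logits_j = q_i * k_j / sqrt(d) + mask_ij */
        for (int j = 0; j < seqLen; j++) {
            float dot = 0.0F;
            for (int t = 0; t < dHead; t++)
                dot += q[i * dHead + t] * k[j * dHead + t];
            logits[j] = dot * scale + mask[i * seqLen + j];
        }

        /* numerically stable softmax over row i */
        float maxv = logits[0];
        for (int j = 1; j < seqLen; j++)
            maxv = std::max(maxv, logits[j]);
        float sum = 0.0F;
        for (int j = 0; j < seqLen; j++) {
            logits[j] = std::exp(logits[j] - maxv);
            sum += logits[j];
        }

        /* weighted sum of the value rows */
        for (int j = 0; j < seqLen; j++) {
            float w = logits[j] / sum;
            for (int t = 0; t < dHead; t++)
                out[i * dHead + t] += w * v[j * dHead + t];
        }
    }
    return out;
}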
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,15 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-10-09
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

-#include <math.h>
+#include <cmath>
 #include "T2TDecoder.h"
-#include "T2TUtility.h"
+#include "module/T2TUtility.h"
-#include "T2TLayerNormal.h"
+#include "module/T2TLayerNormal.h"
+#include "module/T2TCommonModules.h"
 #include "../../tensor/core/CHeader.h"

 namespace transformer
@@ -31,145 +34,162 @@ namespace transformer
 /* constructor */
 AttDecoder::AttDecoder()
 {
-    attentions = NULL;
+    selfAtt = NULL;
     fnns = NULL;
-    attLayerNorms = NULL;
+    selfAttLayerNorms = NULL;
     fnnLayerNorms = NULL;
-    attentionsEnde = NULL;
+    enDeAtt = NULL;
-    attEndeLayerNorms = NULL;
+    enDeAttLayerNorms = NULL;
+    decoderLayerNorm = NULL;
+    selfAttCache = NULL;
+    enDeAttCache = NULL;
 }

 /* de-constructor */
 AttDecoder::~AttDecoder()
 {
-    delete[] attentions;
+    delete[] selfAttCache;
+    delete[] enDeAttCache;
+    delete[] selfAtt;
     delete[] fnns;
-    delete[] attLayerNorms;
+    delete[] selfAttLayerNorms;
     delete[] fnnLayerNorms;
-    delete[] attentionsEnde;
+    delete[] enDeAtt;
-    delete[] attEndeLayerNorms;
+    delete[] enDeAttLayerNorms;
+    if (preNorm)
+        delete decoderLayerNorm;
 }
 /*
 initialize the model
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myIsMasked - indicates whether the masked attention is employed
->> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
+>> config - configurations of the model
 */
-void AttDecoder::InitModel(int argc, char ** argv,
-                           bool myIsMasked, int myIgnored,
-                           int myDevID)
+void AttDecoder::InitModel(T2TConfig& config)
 {
-    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
-
-    devID = myDevID;
-    ignored = myIgnored;
-
-    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
-    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
-    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+    devID = config.devID;
+    nlayer = config.nDecLayer;
+    hSize = config.modelSize;
+    eSize = config.embSize;
+    vSize = config.tgtVocabSize;
+    dropoutP = config.dropout;
+    preNorm = config.preNorm;

     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
     CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");

     /* embedding model */
-    embedder.InitModel(argc, argv, devID, false);
+    embedder.InitModel(config, false);

-    attentions = new T2TAttention[nlayer];
+    selfAtt = new T2TAttention[nlayer];
     fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
+    selfAttLayerNorms = new T2TLN[nlayer];
+    enDeAtt = new T2TAttention[nlayer];
+    enDeAttLayerNorms = new T2TLN[nlayer];
     fnnLayerNorms = new T2TLN[nlayer];
-    attentionsEnde = new T2TAttention[nlayer];
+    selfAttCache = new Cache[nlayer];
-    attEndeLayerNorms = new T2TLN[nlayer];
+    enDeAttCache = new Cache[nlayer];
+    if (preNorm)
+        decoderLayerNorm = new T2TLN;

     /* initialize the stacked layers */
     for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+        selfAtt[i].InitModel(config);
-        fnns[i].InitModel(argc, argv, myDevID);
+        fnns[i].InitModel(config);
-        attLayerNorms[i].InitModel(argc, argv, myDevID);
+        selfAttLayerNorms[i].InitModel(config);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
+        fnnLayerNorms[i].InitModel(config);
-        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
+        enDeAtt[i].InitModel(config);
-        attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
+        enDeAttLayerNorms[i].InitModel(config);
     }
+    if (preNorm)
+        decoderLayerNorm->InitModel(config);
 }
 /*
 make the decoding network
 >> inputDec - the input tensor of the decoder
 >> outputEnc - the output tensor of the encoder
 >> mask - mask that indicates which position is valid
 >> maskEncDec - mask for the encoder-decoder attention
+>> nstep - the current length of the decoder input
 >> isTraining - indicates whether the model is used for training
-<< return - the output tensor of the encoder
+<< return - the output tensor of the decoder
 */
-XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
+XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                         XTensor* maskEncDec, int nstep, bool isTraining)
 {
     XTensor x;

-    x = embedder.Make(inputDec);
+    x = embedder.Make(inputDec, true, isTraining, nstep);

     /* dropout */
     if (isTraining && dropoutP > 0)
         x = Dropout(x, dropoutP);

     for (int i = 0; i < nlayer; i++) {
         XTensor att;
         XTensor ende;
-        XTensor ln;
         XTensor fnn;
         XTensor res;
+        XTensor selfAttnBefore;
+        XTensor selfAttnAfter;
+        XTensor endeAttnBefore;
+        XTensor endeAttnAfter;
+        XTensor fnnBefore;

+        /* layer normalization with pre-norm for self-attention */
+        selfAttnBefore = LayerNorm(x, selfAttLayerNorms[i], preNorm, true, false);

         /******************/
         /* self attention */
-        att = attentions[i].MakeBig(x, mask, isTraining);
+        att = selfAtt[i].Make(selfAttnBefore, selfAttnBefore, selfAttnBefore,
+                              mask, isTraining, &selfAttCache[i], SELF_ATT);

         /* dropout */
         if (isTraining && dropoutP > 0)
             att = Dropout(att, dropoutP);

         /* residual connection */
         res = Sum(att, x);

-        /* layer normalization */
-        x = attLayerNorms[i].Make(res);
+        /* layer normalization with post-norm for self-attention */
+        selfAttnAfter = LayerNorm(res, selfAttLayerNorms[i], preNorm, false, true);

+        /* layer normalization with pre-norm for encoder-decoder attention */
+        endeAttnBefore = LayerNorm(selfAttnAfter, enDeAttLayerNorms[i], preNorm, true, false);

+        /*****************************/
         /* encoder-decoder attention */
-        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
+        ende = enDeAtt[i].Make(outputEnc, endeAttnBefore, outputEnc, maskEncDec,
+                               isTraining, &enDeAttCache[i], EN_DE_ATT);

         /* dropout */
         if (isTraining && dropoutP > 0)
             ende = Dropout(ende, dropoutP);

         /* residual connection */
-        res = Sum(ende, x);
+        res = Sum(ende, selfAttnAfter);

-        /* layer normalization */
-        x = attEndeLayerNorms[i].Make(res);
+        /* layer normalization with post-norm for encoder-decoder attention */
+        endeAttnAfter = LayerNorm(res, enDeAttLayerNorms[i], preNorm, false, true);

+        /* layer normalization with pre-norm for fnn */
+        fnnBefore = LayerNorm(endeAttnAfter, fnnLayerNorms[i], preNorm, true, false);

+        /*******/
         /* fnn */
-        fnn = fnns[i].Make(x, isTraining);
+        fnn = fnns[i].Make(fnnBefore, isTraining);

         /* dropout */
         if (isTraining && dropoutP > 0)
             fnn = Dropout(fnn, dropoutP);

         /* residual connection */
-        res = Sum(fnn, x);
+        res = Sum(fnn, endeAttnAfter);

-        /* layer normalization */
-        x = fnnLayerNorms[i].Make(res);
+        /* layer normalization with post-norm for fnn */
+        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
     }

+    x.SetName(DECODING_NAME);

+    if (preNorm)
+        x = decoderLayerNorm->Make(x);

     return x;
 }

 }
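The rewritten decoder (and the encoder below) routes every sublayer through a LayerNorm(x, ln, preNorm, before, after) helper from module/T2TCommonModules, whose body this diff does not show. A plausible sketch, assuming the helper does nothing more than decide whether the given normalization is active at that call site:

/* Hypothetical sketch of the LayerNorm helper used above (the real one lives
   in module/T2TCommonModules, which this diff does not include). Each call
   site asks for normalization either before the sublayer (pre-norm) or after
   the residual sum (post-norm); the helper applies ln only when the request
   matches the configured preNorm mode, otherwise it passes x through. */
XTensor LayerNorm(XTensor& x, T2TLN& ln, bool preNorm, bool before, bool after)
{
    if (preNorm && before)
        return ln.Make(x);   /* pre-norm: normalize the sublayer input */
    if (!preNorm && after)
        return ln.Make(x);   /* post-norm: normalize after the residual */
    return x;                /* inactive at this call site */
}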
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,18 +17,17 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

 #ifndef __T2TDECODER_H__
 #define __T2TDECODER_H__

 #include "T2TEncoder.h"
+#include "module/T2TUtility.h"

 namespace transformer
 {

+#define DECODING_NAME "decoding"
+#define DECODING_INPUT_NAME "decoding_input"

 class AttDecoder
 {
@@ -52,50 +51,52 @@ public:
     /* dropout probability */
     DTYPE dropoutP;

-    /* some positions can be ignored in attention. this is useful in lm where the first position needs
-     * special design for the attention model. */
-    int ignored;

     /* embedding of word at each position */
     T2TEmbedder embedder;

     /* FNN model of each layer */
-    T2TFNN * fnns;
+    T2TFNN* fnns;

     /* attention model of each layer */
-    T2TAttention * attentions;
+    T2TAttention* selfAtt;

-    /* layer normalization for fnn */
-    T2TLN * fnnLayerNorms;

     /* layer normalization for attention */
-    T2TLN * attLayerNorms;
+    T2TLN* selfAttLayerNorms;

-    /* input tensor of the encoder */
-    XTensor * input;
+    /* layer normalization for fnn */
+    T2TLN* fnnLayerNorms;

-    /* output tensor of the encoder */
-    XTensor * output;
+    /* layer normalization for decoder */
+    T2TLN* decoderLayerNorm;

     /* encoder-decoder attention model of each layer */
-    T2TAttention * attentionsEnde;
+    T2TAttention* enDeAtt;

     /* layer normalization for encoder-decoder attention */
-    T2TLN * attEndeLayerNorms;
+    T2TLN* enDeAttLayerNorms;

+    /* layer cache list */
+    Cache* selfAttCache;

+    /* layer cache list */
+    Cache* enDeAttCache;

+    /* the location of layer normalization */
+    bool preNorm;

 public:
     /* constructor */
     AttDecoder();

-    /* deconstructor */
+    /* de-constructor */
     ~AttDecoder();

     /* initialize the model */
-    void InitModel(int argc, char ** argv,
-                   bool myIsMasked, int myIgnored,
-                   int myDevID = -1);
+    void InitModel(T2TConfig& config);

     /* make the decoding network */
-    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
+    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                 XTensor* maskEncDec, int nstep, bool isTraining);
 };

 }
...
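The new selfAttCache/enDeAttCache members point at incremental decoding: at step t, attention needs keys and values only for the newest position, appended to those computed at earlier steps. The real Cache interface is not part of this diff; a hypothetical sketch of the idea, assuming the library's Concatenate(a, b, dim):

/* Hypothetical sketch of a per-layer key/value cache for incremental
   decoding; the actual Cache type in this commit may differ. */
struct KVCacheSketch {
    XTensor key;          /* keys of all decoded positions so far */
    XTensor value;        /* values of all decoded positions so far */
    bool    miss = true;  /* true until the first step fills the cache */

    /* append the newest step's keys/values along the sequence dimension */
    void Update(const XTensor& k, const XTensor& v) {
        if (miss) {
            key = k;
            value = v;
            miss = false;
        }
        else {
            /* concatenate on dimension 1, the sequence axis used above */
            key = Concatenate(key, k, 1);
            value = Concatenate(value, v, 1);
        }
    }
};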
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
*/
#include <math.h>
#include "T2TEmbedding.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* de-constructor */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
}
else{
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(eSize, d, maxLength);
}
/*
make positional embeddings (of size length * eSize)
>> eSize - embedding size
>> d - dimension size of the hidden layers
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
for(int pos = 0; pos < length; pos++){
float * dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
for(int i = 0; i < channelSize; i++){
dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
}
/*
for(int k = 0; k < eSize; k++){
if(k % 2 == 0){
int i = k/2;
dp[k] = (float)sin(pos/pow(10000.0F, 2.0F*i/d));
}
else{
int i = (k - 1)/2;
dp[k] = (float)cos(pos/pow(10000.0F, 2.0F*i/d));
}
}
*/
}
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
delete[] data;
}
/*
make the network
>> input - the word index tensor
<< return - word embedding + positional embedding
*/
XTensor T2TEmbedder::Make(XTensor &input)
{
//CheckNTErrors(input.GetDim(-1) == vSize, "Wrong vocabulary size!");
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
/* make positional embeddings */
XTensor position;
XTensor embTMP;
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
position.Range(0, position.unitNum, 1);
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, dims[0]);
/* make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* sum over the two embeddings */
return wordEmbedding + posEmbedding;
}
}
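To sanity-check MakePosEmbedding above by hand, here is an equivalent standalone routine with the same formulas over a plain std::vector; note that this variant divides by d - 2 where the original Transformer paper divides by d. The name MakePosEmbeddingRef is illustrative only.

#include <cmath>
#include <vector>

/* Standalone equivalent of MakePosEmbedding above: row pos holds
   sin(pos / 10000^(2i/(d-2))) for the first eSize/2 channels and the matching
   cos values for the second half. Returns a [length x eSize] row-major table. */
std::vector<float> MakePosEmbeddingRef(int eSize, int d, int length)
{
    std::vector<float> table(length * eSize);
    int channelSize = eSize / 2;

    for (int pos = 0; pos < length; pos++) {
        float* dp = table.data() + pos * eSize;
        int offset = 0;
        for (int i = 0; i < channelSize; i++)
            dp[offset++] = (float)sin(pos / pow(10000.0F, 2.0F * i / (d - 2)));
        for (int i = 0; i < channelSize; i++)
            dp[offset++] = (float)cos(pos / pow(10000.0F, 2.0F * i / (d - 2)));
    }
    return table;
}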
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,15 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

-#include <math.h>
+#include <cmath>
 #include "T2TEncoder.h"
-#include "T2TLayerNormal.h"
+#include "module/T2TUtility.h"
-#include "T2TUtility.h"
+#include "module/T2TLayerNormal.h"
+#include "module/T2TCommonModules.h"
 #include "../../tensor/core/CHeader.h"

 namespace transformer
@@ -31,62 +34,65 @@ namespace transformer
 /* constructor */
 AttEncoder::AttEncoder()
 {
-    attentions = NULL;
+    selfAtt = NULL;
     fnns = NULL;
     attLayerNorms = NULL;
     fnnLayerNorms = NULL;
+    encoderLayerNorm = NULL;
 }

 /* de-constructor */
 AttEncoder::~AttEncoder()
 {
-    delete[] attentions;
+    delete[] selfAtt;
     delete[] fnns;
     delete[] attLayerNorms;
     delete[] fnnLayerNorms;
+    if (preNorm)
+        delete encoderLayerNorm;
 }
 /*
 initialize the model
->> argc - number of arguments
->> argv - list of pointers to the arguments
->> myIsMasked - indicates whether the masked attention is employed
->> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
+>> config - configurations for the model
 */
-void AttEncoder::InitModel(int argc, char ** argv,
-                           bool myIsMasked, int myIgnored,
-                           int myDevID)
+void AttEncoder::InitModel(T2TConfig& config)
 {
-    devID = myDevID;
-    ignored = myIgnored;
-
-    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
-    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
-    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+    devID = config.devID;
+    nlayer = config.nEncLayer;
+    eSize = config.embSize;
+    hSize = config.modelSize;
+    vSize = config.srcVocabSize;
+    preNorm = config.preNorm;
+    dropoutP = config.dropout;

     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
     CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

     /* embedding model */
-    embedder.InitModel(argc, argv, devID);
+    embedder.InitModel(config);

-    attentions = new T2TAttention[nlayer];
+    selfAtt = new T2TAttention[nlayer];
     fnns = new T2TFNN[nlayer];
     attLayerNorms = new T2TLN[nlayer];
     fnnLayerNorms = new T2TLN[nlayer];
+    if (preNorm)
+        encoderLayerNorm = new T2TLN;

     /* initialize the stacked layers */
     for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+        selfAtt[i].InitModel(config);
-        fnns[i].InitModel(argc, argv, myDevID);
+        fnns[i].InitModel(config);
-        attLayerNorms[i].InitModel(argc, argv, myDevID);
+        attLayerNorms[i].InitModel(config);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
+        fnnLayerNorms[i].InitModel(config);
     }
+    if (preNorm)
+        encoderLayerNorm->InitModel(config);
 }
 /*
 make the encoding network
 >> input - the input tensor of the encoder
 >> mask - the mask that indicates each position is valid
@@ -94,67 +100,74 @@ make the encoding network
 >> isTraining - indicates whether the model is used for training
 << return - the output tensor of the encoder
 */
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining)
+XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
 {
     XTensor x;

-    x = embedder.Make(input);
+    x = embedder.Make(input, false, isTraining);

     /* dropout */
     if (isTraining && dropoutP > 0)
         x = Dropout(x, dropoutP);

     for (int i = 0; i < nlayer; i++) {
         XTensor att;
-        XTensor ln;
         XTensor fnn;
         XTensor res;
+        XTensor attnBefore;
+        XTensor attnAfter;
+        XTensor fnnBefore;

+        /* layer normalization with pre-norm for self-attn */
+        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);

         /* self attention */
-        att = attentions[i].MakeBig(x, mask, isTraining);
+        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);

         /* dropout */
         if (isTraining && dropoutP > 0)
             att = Dropout(att, dropoutP);

         /* residual connection */
         res = Sum(att, x);

-        /* layer normalization */
-        x = attLayerNorms[i].Make(res);
+        /* layer normalization with post-norm for self-attn */
+        attnAfter = LayerNorm(res, attLayerNorms[i], preNorm, false, true);

+        /* layer normalization with pre-norm for fnn */
+        fnnBefore = LayerNorm(attnAfter, fnnLayerNorms[i], preNorm, true, false);

         /* fnn */
-        fnn = fnns[i].Make(x, isTraining);
+        fnn = fnns[i].Make(fnnBefore, isTraining);

         /* dropout */
         if (isTraining && dropoutP > 0)
             fnn = Dropout(fnn, dropoutP);

         /* residual connection */
-        res = Sum(fnn, x);
+        res = Sum(fnn, attnAfter);

-        /* layer normalization */
-        x = fnnLayerNorms[i].Make(res);
+        /* layer normalization with post-norm for fnn */
+        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
     }

-    x.SetName(ENCODING_NAME);
-    input.SetName(ENCODING_INPUT_NAME);
+    if (preNorm)
+        x = encoderLayerNorm->Make(x);

     return x;
 }
 /*
 make the encoding network (wrapper)
 >> input - the input tensor of the encoder
 >> mask - the mask that indicates each position is valid
 >> isTraining - indicates whether the model is used for training
 << return - the output tensor of the encoder
 */
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
+XTensor AttEncoder::Make(XTensor& input, XTensor* mask, bool isTraining)
 {
     XTensor nothing;

     return Make(input, mask, nothing, isTraining);
 }

 }
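Written out without the helper, the preNorm flag in the loop above selects between the two standard residual arrangements. An illustrative sketch for one generic sublayer F (self-attention or fnn); this is the textbook pre-norm/post-norm distinction, not code from this commit:

#include <functional>

/* The two residual arrangements the preNorm flag selects between, spelled out
   for a generic sublayer F. The loop above expresses the same thing through
   the LayerNorm(..., before, after) helper. */
XTensor EncoderSublayerRef(XTensor& x, T2TLN& ln, bool preNorm,
                           const std::function<XTensor(XTensor&)>& F)
{
    if (preNorm) {
        /* pre-norm: y = x + F(LN(x)); a final LN runs once after all layers */
        XTensor normed = ln.Make(x);
        XTensor y = F(normed);
        return Sum(y, x);
    }
    else {
        /* post-norm: y = LN(x + F(x)) */
        XTensor y = F(x);
        XTensor summed = Sum(y, x);
        return ln.Make(summed);
    }
}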
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,47 +17,35 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

 #ifndef __T2TENCODER_H__
 #define __T2TENCODER_H__

-#include "T2TFNN.h"
+#include "module/T2TFNN.h"
-#include "T2TAttention.h"
+#include "module/T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "module/T2TAttention.h"
-#include "T2TLayerNormal.h"
+#include "module/T2TEmbedding.h"
+#include "module/T2TLayerNormal.h"
 #include "../../network/XNet.h"

 using namespace nts;

 namespace transformer
 {

-#define ENCODING_NAME "encoding"
-#define ENCODING_INPUT_NAME "encoding_input"
 /*
 base class of the encoder
 */
 class T2TEncoder
 {
 public:
-    virtual
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining) = 0;
+    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
 };

-/*
-the encoder based on RNN
-*/
-class RNNEncoder : T2TEncoder
-{
-public:
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &mask2, bool isTraining);
-};

 /*
 the encoder based on self-attention
 */
 class AttEncoder : T2TEncoder
 {
@@ -88,23 +76,23 @@ public:
     T2TEmbedder embedder;

     /* FNN model of each layer */
-    T2TFNN * fnns;
+    T2TFNN* fnns;

     /* attention model of each layer */
-    T2TAttention * attentions;
+    T2TAttention* selfAtt;

+    /* layer normalizations for attention */
+    T2TLN* attLayerNorms;

     /* layer normalization for fnn */
-    T2TLN * fnnLayerNorms;
+    T2TLN* fnnLayerNorms;

-    /* layer normalization for attention */
-    T2TLN * attLayerNorms;
+    /* layer normalization for encoder */
+    T2TLN* encoderLayerNorm;

-    /* input tensor of the encoder */
-    XTensor * input;
+    /* the location of layer normalization */
+    bool preNorm;

-    /* output tensor of the encoder */
-    XTensor * output;

 public:
     /* constructor */
     AttEncoder();
@@ -113,18 +101,15 @@ public:
     /* de-constructor */
     ~AttEncoder();
     /* initialize the model */
-    void InitModel(int argc, char ** argv,
-                   bool myIsMasked, int myIgnored,
-                   int myDevID = -1);
+    void InitModel(T2TConfig& config);

     /* make the encoding network */
-    XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
+    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);

     /* make the encoding network (wrapper) */
-    XTensor Make(XTensor &input, XTensor &mask, bool isTraining);
+    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
 };

 }

 #endif
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,13 +17,15 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

+#include <cstdint>
 #include "T2TModel.h"
-#include "T2TUtility.h"
+#include "module/T2TUtility.h"
-#include "../../tensor/core/CHeader.h"
 #include "../../tensor/XUtility.h"
+#include "../../tensor/core/CHeader.h"

 namespace transformer
 {
@@ -34,6 +36,9 @@ T2TModel::T2TModel()
     devID = -1;
     isLM = false;
     isMT = false;
+    useFP16 = false;
+    shareAllEmbeddings = false;
+    shareDecInputOutputWeight = false;
     nhead = 1;

     encoder = new AttEncoder();
@@ -49,48 +54,74 @@ T2TModel::~T2TModel()
     delete outputLayer;
 }
 /*
 initialize the model
->> argc - number of arguments
->> argv - list of pointers to the arguments
+>> config - configurations of the model
 */
-void T2TModel::InitModel(int argc, char ** argv)
+void T2TModel::InitModel(T2TConfig& config)
 {
-    LoadParamInt(argc, argv, "dev", &devID, -1);
-    LoadParamBool(argc, argv, "mt", &isMT, false);
-    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
-    LoadParamInt(argc, argv, "nhead", &nhead, 8);
+    devID = config.devID;
+    isMT = config.isMT;
+    isLM = !isMT;
+    useFP16 = config.useFP16;

+    /* configurations for the model */
+    int* metaInfo[] = {
+        &config.nEncLayer, &config.nDecLayer,
+        &config.fnnHiddenSize, &config.modelSize,
+        &config.embSize, &config.srcVocabSize,
+        &config.tgtVocabSize, &config.nhead,
+        &config.maxRP, &shareAllEmbeddings,
+        &shareDecInputOutputWeight,
+        &config.maxPosLen
+    };

+    FILE* modelFile = NULL;

+    /* read model configurations */
+    if (!config.isTraining) {
+        modelFile = fopen(config.modelFN, "rb");
+        for (auto& meta : metaInfo)
+            fread(meta, sizeof(int), 1, modelFile);
+    }
+    nhead = config.nhead;

-    encoder->InitModel(argc, argv, true, 0, devID);
+    encoder->InitModel(config);
-    outputLayer->InitModel(argc, argv, devID);
+    outputLayer->InitModel(config);

-    if(isMT)
+    if (isMT)
-        decoder->InitModel(argc, argv, true, 0, devID);
+        decoder->InitModel(config);

     TensorList params(10);
     GetParams(params);

-    for(int i = 0; i < params.count; i++){
-        XTensor * param = (XTensor*)params.Get(i);
-        param->SetVarFlag();
-    }
+    /* load parameters */
+    if (!config.isTraining)
+        Read(modelFile);
+    else {
+        for (int i = 0; i < params.Size(); i++)
+            params[i]->SetVarFlag();
+    }

+    if (modelFile != NULL)
+        fclose(modelFile);
 }
 /*
 make the encoding network
 >> input - input tensor
 >> mask - the mask for positions that are/not involved in computation
 >> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
+XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
 {
     XTensor nothing;

     return encoder->Make(input, mask, nothing, isTraining);
 }
 /*
 make the decoding network
 >> inputDec - input tensor of the decoder
 >> outputEnc - output tensor of the encoder
@@ -100,23 +131,25 @@ make the decoding network
 >> isTraining - indicates whether we are training the model
 << return - decoding result
 */
-XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining)
+XTensor T2TModel::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
+                              XTensor* mask, XTensor& maskEncDec, bool isTraining)
 {
-    return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
+    return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
+                         inputDec.GetDim(1), isTraining);
 }
 /*
 make the network for language modeling (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
+void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
 {
     int len = padding.GetDim(padding.order - 1);
-    int * dims = new int[padding.order + 2];
+    int* dims = new int[padding.order + 2];
-    for(int i = 0; i < padding.order; i++)
+    for (int i = 0; i < padding.order; i++)
         dims[i + 1] = padding.GetDim(i);
     dims[0] = nhead;
     dims[padding.order + 1] = len;
@@ -134,12 +167,12 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)

     /* forward */
     XTensor encoding;

-    encoding = MakeEncoder(input, mask, isTraining);
+    encoding = MakeEncoder(input, &mask, isTraining);

-    outputLayer->Make(encoding, output);
+    outputLayer->Make(encoding, output, true, true);
 }
 /*
 make the network for machine translation (with the output softmax layer)
 >> inputEnc - input tensor of the encoder
 >> inputDec - input tensor of the decoder
 >> output - output tensor (distribution)
@@ -147,9 +180,9 @@ make the network for machine translation (with the output softmax layer)
 >> paddingDec - padding of the sequences (on the decoder side)
 >> isTraining - indicates whether the model is for training
 */
-void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
-                      XTensor &paddingEnc, XTensor &paddingDec,
-                      bool isTraining)
+void T2TModel::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
+                      XTensor& paddingEnc, XTensor& paddingDec,
+                      bool isTraining)
 {
     XTensor encoding;
     XTensor decoding;
@@ -159,19 +192,19 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,

     /* encoder mask */
     MakeMTMaskEnc(paddingEnc, maskEnc);

     /* decoder mask */
     MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);

-    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);

-    decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
+    decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);

-    outputLayer->Make(decoding, output);
+    outputLayer->Make(decoding, output, true, true);
 }
 /*
 make the mask for training MT models
 >> inputEnc - input of the encoder
 >> inputDec - input of the decoder
 >> paddingEnc - padding of the encoder input
@@ -180,31 +213,31 @@ make the mask for training MT models
 >> maskDec - mask of the decoder self-attention
 >> maskEncDec - mask of the decoder enc-dec attention
 */
-void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
-                          XTensor &paddingEnc, XTensor &paddingDec,
-                          XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
+void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
+                          XTensor& paddingEnc, XTensor& paddingDec,
+                          XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
 {
     int len = inputDec.GetDim(inputDec.order - 1);
-    int * dims = new int[inputDec.order + 2];
+    int* dims = new int[inputDec.order + 2];
-    for(int i = 0; i < inputDec.order; i++)
+    for (int i = 0; i < inputDec.order; i++)
         dims[i + 1] = inputDec.GetDim(i);
     dims[0] = nhead;
     dims[inputDec.order + 1] = len;
     InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);

     /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
        this matrix can be used to prevent the attention to current or following words in
        a given sequence. */
     _SetDataLowTri(&maskDec, 1e9F, 0);
-    _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
+    ScaleAndShiftMe(maskDec, 1.0F, -1e9F);

     /* encoder-decoder mask that prevents the attention to padding dummy words */
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);

-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.devID);
+    XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
+                                             paddingEnc.dataType, paddingEnc.devID);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
+    XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);

     _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
     _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
@@ -214,21 +247,21 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
     DelTensorBuf(maskEncDecTMPEnc);

     /* padding on the source side */
-    int * dimsPadding = new int[paddingEnc.order + 2];
+    int* dimsPadding = new int[paddingEnc.order + 2];
     for (int i = 0; i < paddingEnc.order - 1; i++)
         dimsPadding[i] = paddingEnc.GetDim(i);
     dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
     dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);

-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.devID);
+    XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                     paddingEnc.devID);

     for (int i = 0; i < padding2->order; i++)
         dimsPadding[i + 1] = padding2->GetDim(i);
     dimsPadding[0] = nhead;

-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.devID);
+    XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                     paddingEnc.devID);

     /* mask of the padding */
     _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -248,30 +281,30 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
     DelTensorBuf(padding3);
     DelTensorBuf(padding2);
 }
 /*
 make the mask of the encoder
 >> paddingEnc - padding of the encoder input
 >> maskEnc - mask of the encoder self-attention
 */
-void T2TModel::MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc)
+void T2TModel::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
 {
     XTensor padding2;
     XTensor padding3;

     /* mask of the padding */
     Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
     Unsqueeze(padding2, padding3, 0, nhead);
     ScaleAndShiftMe(padding3, 1e9F, -1e9F);

     InitTensor(&maskEnc, &padding3);
     maskEnc.SetZeroAll();

     /* generate the mask on the source language side (for padding) */
     SumMe(maskEnc, padding3);
 }
 /*
 make the mask of the decoder
 >> paddingEnc - padding of the encoder input
@@ -281,106 +314,145 @@ make the mask of the decoder
 >> maskDec - mask of the decoder self-attention
 >> maskEncDec - mask of the decoder enc-dec attention
 */
-void T2TModel::MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec,
-                             XTensor &maskDec, XTensor &maskEncDec)
+void T2TModel::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
+                             XTensor& maskDec, XTensor& maskEncDec)
 {
     int len = paddingDec.GetDim(paddingDec.order - 1);
-    int * dims = new int[paddingDec.order + 2];
+    int* dims = new int[paddingDec.order + 2];
-    for(int i = 0; i < paddingDec.order; i++)
+    for (int i = 0; i < paddingDec.order; i++)
         dims[i + 1] = paddingDec.GetDim(i);
     dims[0] = nhead;
     dims[paddingDec.order + 1] = len;
     InitTensor(&maskDec, paddingDec.order + 2, dims, X_FLOAT, paddingDec.devID);

     /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
        this matrix can be used to block the attention to current or following words in
        a given sequence. */
     _SetDataLowTri(&maskDec, 1e9F, 0);
     ScaleAndShiftMe(maskDec, 1.0F, -1e9F);

     /* encoder-decoder mask that prevents the attention to padding dummy words */
     XTensor maskEncDecTMP;

     Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
     ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
     Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);

     delete[] dims;
 }
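Both mask builders above reduce to two value patterns: a causal mask (0 on and below the diagonal, -1e9 above it) and a padding mask (0 for real tokens, -1e9 for padding), each added to the attention logits before the softmax. A loop-level sketch of the values, useful for checking against the tensor ops used above; the function names are illustrative:

#include <vector>

/* What _SetDataLowTri(&maskDec, 1e9F, 0) followed by
   ScaleAndShiftMe(maskDec, 1.0F, -1e9F) produces for one head:
   0 on and below the diagonal (attention allowed), -1e9 above it
   (future positions blocked once added to the attention logits). */
std::vector<float> CausalMaskRef(int len)
{
    std::vector<float> m(len * len);
    for (int i = 0; i < len; i++)
        for (int j = 0; j < len; j++)
            m[i * len + j] = (j <= i) ? 0.0F : -1e9F;
    return m;
}

/* What ScaleAndShiftMe(padding, 1e9F, -1e9F) produces from 0/1 padding:
   real tokens (1) map to 0, padding tokens (0) map to -1e9. */
std::vector<float> PaddingMaskRef(const std::vector<float>& padding)
{
    std::vector<float> m(padding.size());
    for (size_t i = 0; i < padding.size(); i++)
        m[i] = padding[i] * 1e9F - 1e9F;
    return m;
}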
 /*
-get parameter matrics
+get parameter matrices
->> list - the list that keeps the parameter matrics
+>> list - the list that keeps the parameter matrices
 */
-void T2TModel::GetParams(TensorList &list)
+void T2TModel::GetParams(TensorList& list)
 {
     list.Clear();
-    list.Add(&outputLayer->w);

+    /* encoder parameters */
     for (int i = 0; i < encoder->nlayer; i++) {
+        list.Add(&encoder->selfAtt[i].wq);
+        list.Add(&encoder->selfAtt[i].wk);
+        list.Add(&encoder->selfAtt[i].wv);
+        list.Add(&encoder->selfAtt[i].bq);
+        list.Add(&encoder->selfAtt[i].bk);
+        list.Add(&encoder->selfAtt[i].bv);
+        if (encoder->selfAtt[i].useRPR)
+            list.Add(&encoder->selfAtt[i].RPEmbK);
+        list.Add(&encoder->selfAtt[i].wo);
+        list.Add(&encoder->selfAtt[i].bo);
         list.Add(&encoder->fnns[i].w1);
         list.Add(&encoder->fnns[i].b1);
         list.Add(&encoder->fnns[i].w2);
         list.Add(&encoder->fnns[i].b2);
-        //list.Add(&encoder->attentions[i].wk);
-        //list.Add(&encoder->attentions[i].wq);
-        //list.Add(&encoder->attentions[i].wv);
-        list.Add(&encoder->attentions[i].wbig);
-        list.Add(&encoder->attentions[i].wa);
-        list.Add(&encoder->fnnLayerNorms[i].w);
-        list.Add(&encoder->fnnLayerNorms[i].b);
         list.Add(&encoder->attLayerNorms[i].w);
         list.Add(&encoder->attLayerNorms[i].b);
+        list.Add(&encoder->fnnLayerNorms[i].w);
+        list.Add(&encoder->fnnLayerNorms[i].b);
     }
+    if (encoder->preNorm) {
+        list.Add(&encoder->encoderLayerNorm->w);
+        list.Add(&encoder->encoderLayerNorm->b);
+    }

-    list.Add(&encoder->embedder.w);

     if (isMT) {
+        /* decoder parameters */
         for (int i = 0; i < decoder->nlayer; i++) {
+            list.Add(&decoder->selfAtt[i].wq);
+            list.Add(&decoder->selfAtt[i].wk);
+            list.Add(&decoder->selfAtt[i].wv);
+            list.Add(&decoder->selfAtt[i].bq);
+            list.Add(&decoder->selfAtt[i].bk);
+            list.Add(&decoder->selfAtt[i].bv);
+            if (decoder->selfAtt[i].useRPR)
+                list.Add(&decoder->selfAtt[i].RPEmbK);
+            list.Add(&decoder->selfAtt[i].wo);
+            list.Add(&decoder->selfAtt[i].bo);
+            list.Add(&decoder->selfAttLayerNorms[i].w);
+            list.Add(&decoder->selfAttLayerNorms[i].b);
+            list.Add(&decoder->enDeAtt[i].wq);
+            list.Add(&decoder->enDeAtt[i].wk);
+            list.Add(&decoder->enDeAtt[i].wv);
+            list.Add(&decoder->enDeAtt[i].bq);
+            list.Add(&decoder->enDeAtt[i].bk);
+            list.Add(&decoder->enDeAtt[i].bv);
+            list.Add(&decoder->enDeAtt[i].wo);
+            list.Add(&decoder->enDeAtt[i].bo);
+            list.Add(&decoder->enDeAttLayerNorms[i].w);
+            list.Add(&decoder->enDeAttLayerNorms[i].b);
             list.Add(&decoder->fnns[i].w1);
             list.Add(&decoder->fnns[i].b1);
             list.Add(&decoder->fnns[i].w2);
             list.Add(&decoder->fnns[i].b2);
-            list.Add(&decoder->attentionsEnde[i].wk);
-            list.Add(&decoder->attentionsEnde[i].wq);
-            list.Add(&decoder->attentionsEnde[i].wv);
-            list.Add(&decoder->attentionsEnde[i].wa);
-            list.Add(&decoder->attEndeLayerNorms[i].w);
-            list.Add(&decoder->attEndeLayerNorms[i].b);
-            //list.Add(&decoder->attentions[i].wk);
-            //list.Add(&decoder->attentions[i].wq);
-            //list.Add(&decoder->attentions[i].wv);
-            list.Add(&decoder->attentions[i].wbig);
-            list.Add(&decoder->attentions[i].wa);
             list.Add(&decoder->fnnLayerNorms[i].w);
             list.Add(&decoder->fnnLayerNorms[i].b);
-            list.Add(&decoder->attLayerNorms[i].w);
-            list.Add(&decoder->attLayerNorms[i].b);
         }
+        if (decoder->preNorm) {
+            list.Add(&decoder->decoderLayerNorm->w);
+            list.Add(&decoder->decoderLayerNorm->b);
+        }
-        list.Add(&decoder->embedder.w);
     }

+    list.Add(&encoder->embedder.w);
+    if (isMT && (shareAllEmbeddings == 0)) {
+        list.Add(&decoder->embedder.w);
+    }
+    if (shareDecInputOutputWeight == 0)
+        list.Add(&outputLayer->w);
 }
 /*
-dump the parameters
+dump the model to a file
->> fn - where to keep the model
+>> fn - where to save the model
 */
-void T2TModel::Dump(const char * fn)
+void T2TModel::Dump(const char* fn)
 {
     double startT = GetClockSec();

-    FILE * file = fopen(fn, "wb");
+    FILE* file = fopen(fn, "wb");
     CheckNTErrors(file, "Cannot open the model file");

     TensorList params(100);
     GetParams(params);

-    for(int i = 0; i < params.count; i++){
-        XTensor * p = (XTensor*)params.Get(i);
-        p->Dump(file, "param:");
-    }
+    int metaInfo[]{
+        encoder->nlayer, decoder->nlayer,
+        encoder->fnns->hSize, encoder->selfAtt->d,
+        encoder->embedder.eSize, encoder->embedder.vSize,
+        decoder->embedder.vSize, encoder->selfAtt->nhead,
+        encoder->selfAtt->maxRP, shareAllEmbeddings,
+        shareDecInputOutputWeight, encoder->embedder.maxLength - 1 - 1,
+    };

+    /* part 1: hyper-parameters */
+    fwrite(metaInfo, sizeof(int), sizeof(metaInfo) / sizeof(int), file);

+    /* part 2: model parameters */
+    for (int i = 0; i < params.Size(); i++) {
+        params[i]->BinaryDump(file);
+    }

     fclose(file);
@@ -391,27 +463,43 @@ void T2TModel::Dump(const char * fn)
 }
 /* read the parameters */
-void T2TModel::Read(const char * fn)
+void T2TModel::Read(FILE* file)
 {
     double startT = GetClockSec();

-    FILE * file = fopen(fn, "rb");
-    CheckNTErrors(file, "Cannot open the model file");

     TensorList params(100);
     GetParams(params);

-    for(int i = 0; i < params.count; i++){
-        XTensor * p = (XTensor*)params.Get(i);
-        p->Read(file, "param:");
-    }
+    /* convert parameters to FP16 */
+    if (useFP16) {
+        for (int i = 0; i < params.Size(); i++) {
+            XTensor* p = params[i];
+            InitTensorV2(p, p->order, p->dimSize, X_FLOAT16, 1, p->devID);
+        }
+        auto& encEmb = encoder->embedder.posEmbeddingBase;
+        auto& decEmb = decoder->embedder.posEmbeddingBase;
+        encEmb = ConvertDataType(encEmb, X_FLOAT16);
+        decEmb = ConvertDataType(decEmb, X_FLOAT16);
+    }

-    fclose(file);
+    for (int i = 0; i < params.Size(); i++)
+        params[i]->BinaryRead(file);

+    /* share all embeddings */
+    if (shareAllEmbeddings == 1) {
+        decoder->embedder.w = CopyValues(encoder->embedder.w);
+        XPRINT(0, stderr, "[INFO] sharing encoder decoder embeddings\n");
+    }

+    /* share embeddings with output weights */
+    if (shareDecInputOutputWeight == 1) {
+        outputLayer->w = CopyValues(decoder->embedder.w);
+        XPRINT(0, stderr, "[INFO] sharing decoder embeddings with output weights\n");
+    }

     double elapsed = GetClockSec() - startT;
     XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
 }

 }
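Dump and Read above fix a simple on-disk layout: twelve int32 hyper-parameters followed by the raw parameter tensors in GetParams order. A hedged sketch of the header part (field names mirror the metaInfo arrays above; the per-tensor encoding of BinaryDump is defined elsewhere in the library, so only part 1 is shown):

#include <cstdint>
#include <cstdio>

/* Sketch of the fixed header written by T2TModel::Dump: 12 int32 values in
   metaInfo order. Reading them back is exactly what InitModel does before
   calling Read. Field names here mirror the code above. */
struct T2TModelHeader {
    int32_t nEncLayer, nDecLayer;
    int32_t fnnHiddenSize, modelSize;
    int32_t embSize, srcVocabSize;
    int32_t tgtVocabSize, nhead;
    int32_t maxRP, shareAllEmbeddings;
    int32_t shareDecInputOutputWeight, maxPosLen;
};

bool ReadT2TModelHeader(FILE* file, T2TModelHeader& h)
{
    /* part 1 of the model file; part 2 (raw tensors) follows immediately */
    return fread(&h, sizeof(int32_t), 12, file) == 12;
}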
 /* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,16 +17,18 @@
 /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
  */

 #ifndef __T2TMODEL_H__
 #define __T2TMODEL_H__

-#include "T2TFNN.h"
-#include "T2TAttention.h"
 #include "T2TEncoder.h"
 #include "T2TDecoder.h"
-#include "T2TOutput.h"
+#include "module/T2TFNN.h"
+#include "module/T2TOutput.h"
+#include "module/T2TUtility.h"
+#include "module/T2TAttention.h"

 namespace transformer
 {
@@ -41,13 +43,13 @@ public:
int devID; int devID;
/* the encoder */ /* the encoder */
AttEncoder * encoder; AttEncoder* encoder;
/* the decoder */ /* the decoder */
AttDecoder * decoder; AttDecoder* decoder;
/* output layer */ /* output layer */
T2TOutput * outputLayer; T2TOutput* outputLayer;
/* indicates whether the model is running for language modeling */ /* indicates whether the model is running for language modeling */
bool isLM; bool isLM;
...@@ -55,9 +57,18 @@ public: ...@@ -55,9 +57,18 @@ public:
/* indicates whether the model is running for machine translation */ /* indicates whether the model is running for machine translation */
bool isMT; bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* number of heads in the attention model */ /* number of heads in the attention model */
int nhead; int nhead;
/* indicates whether share encoders embeddings with decoders */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings with output weights */
int shareDecInputOutputWeight;
public: public:
/* constructor */ /* constructor */
T2TModel(); T2TModel();
...@@ -66,41 +77,42 @@ public: ...@@ -66,41 +77,42 @@ public:
~T2TModel(); ~T2TModel();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv); void InitModel(T2TConfig& config);
/* make the encoding network */ /* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining); XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
/* make the encoding network */ /* make the encoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, XTensor &MaskEncDec, bool isTraining); XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
/* make the network for langauge modeling (with the output softmax layer) */ /* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining); void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */ /* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining); XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */ /* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec, void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor &paddingEnc, XTensor &paddingDec, XTensor& paddingEnc, XTensor& paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec); XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */ /* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &paddingEnc, XTensor &maskEnc); void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */ /* make the mask of the decoder */
void MakeMTMaskDec(XTensor &paddingEnc, XTensor &paddingDec, void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor &maskDec, XTensor &maskEncDec); XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrics */ /* get parameter matrices */
void GetParams(TensorList &list); void GetParams(TensorList& list);
/* dump the parameters */ /* dump the model to a file */
void Dump(const char * fn); void Dump(const char* fn);
/* read the parameters */ /* read the parameters */
void Read(const char * fn); void Read(FILE* file);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include "T2TSearch.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TSearch::T2TSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
T2TSearch::~T2TSearch()
{
if(fullHypos != NULL)
delete[] fullHypos;
if(endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TSearch::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamInt(argc, argv, "batchsize", &batchSize, 1);
LoadParamFloat(argc, argv, "lenalpha", &alpha, 0.2F);
LoadParamInt(argc, argv, "endid", endSymbols, -1);
LoadParamInt(argc, argv, "startid", &startSymbol, -1);
if(endSymbols[0] >= 0)
endSymbolNum = 1;
}
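/* a minimal configuration sketch (hypothetical argument values, not part of
the commit): Init() picks up the beam width, the length-penalty weight and
the start/end symbol ids from the command line */
static void ExampleInitSearch()
{
const char* args[] = { "-beamsize", "4", "-lenalpha", "0.6",
"-startid", "2", "-endid", "3" };
T2TSearch search;
/* after this call: beamSize = 4, alpha = 0.6F, startSymbol = 2,
endSymbols[0] = 3 and endSymbolNum = 1 */
search.Init(8, (char**)args);
}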
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
{
T2TPredictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input->unitNum/input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(*input, maskEnc, false);
encoding.SetName(ENCODING_NAME);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(*input, input->order - 1, beamSize);
paddingBeam = Unsqueeze(*padding, padding->order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
/* max output-length = 2 * source-length */
maxLength = input->GetDim(-1) * 2;
CheckNTErrors(maxLength > 0, "no max length specified!");
T2TStateBundle * states = new T2TStateBundle[maxLength + 1];
T2TStateBundle * first = states;
/* create the first state */
predictor.Create(model, &encodingBeam, input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
/* generate the sequence from left to right */
for(int i = 0; i < maxLength; i++){
T2TStateBundle * cur = states + i;
T2TStateBundle * next = states + i + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(next);
/* expand the search graph */
Expand(cur, next);
/* push complete hypotheses into the heap */
Collect(next);
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(&states[maxLength]);
Dump(output);
delete[] states;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void T2TSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
{
XTensor &score = beam->modelScore;
XTensor &prob = beam->prob;
XTensor &probPath = beam->probPath;
XTensor &probPathPrev = prev->probPath;
XTensor &lenPrev = prev->nstep;
XTensor &len = beam->nstep;
XTensor lp;
XTensor mask;
int order = prob.order;
int outputSize = prob.GetDim(-1);
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(probPath.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
/* len = lenPrev + 1 (one more token is generated at this step) */
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
lp = T2TLengthPenalizer::GNMT(len, alpha);
lp.Reshape(lp.unitNum);
/* score = log-prob/lp */
_DivDim(&probPath, &lp, &score, 0);
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
_SumDim(&score, &firstMask, &score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
_SumDim(&score, &mask, &score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
probPathPrev.Reshape(order - 1, dims);
lp.Reshape(order - 1, dims);
mask.Reshape(order - 1, dims);
}
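/* a worked instance of the scoring rule above, assuming the usual GNMT
penalty lp = ((5 + len) / 6)^alpha: with alpha = 0.2, a hypothesis of
log-prob -6.0 and length 10 is scored -6.0 / (2.5^0.2) ~= -5.0 */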
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Generate(T2TStateBundle * beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
XTensor &probPath = beam->probPath;
XTensor &prob = beam->prob;
int order = score.order;
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
for (int i = 0; i < order; i++) {
dims[i] = score.GetDim(i);
dimsBeam[i] = score.GetDim(i);
dimsTopK[i] = score.GetDim(i);
}
/* check the beam dimension only after dimsBeam has been filled */
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.GetDim(-1);
int stride = score.GetDim(-1);
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType,
score.devID);
InitTensor(&index, order, dimsTopK, X_INT,
score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypothesis. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
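/* e.g., with beamSize = 2 and a vocabulary of 8 words, a top-k index of 13
within a row decodes to previous state 13 / 8 = 1 and word id 13 % 8 = 5 */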
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it with vocab-size and computing the remainder. */
ModMe(index, sizeVocab);
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensor(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* a device-side copy of the index (TODO: avoid the slow elementwise
access to this tensor in the loop below!!!) */
XTensor indexGPU;
indexGPU = CopyValues(index);
for (int i = 0; i < indexGPU.unitNum; i++)
indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i);
CheckNTErrors(IsSameShaped(prob, probPath), "Wrong tensor shape!");
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensor(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensor(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
dimsTopK[i] = probPathTopK.GetDim(i);
}
order = probPath.order;
probPath.Reshape(1, probPath.unitNum);
probPathTopK.Reshape(1, probPathTopK.unitNum);
prob.Reshape(1, prob.unitNum);
probTopK.Reshape(1, probTopK.unitNum);
_CopyIndexed(&probPath, &probPathTopK, probPathTopK.order - 1, &indexGPU);
_CopyIndexed(&prob, &probTopK, probTopK.order - 1, &indexGPU);
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
prob.Reshape(order, dims);
probTopK.Reshape(order, dimsTopK);
probPath = probPathTopK;
prob = probTopK;
}
/*
expand the search graph
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState * states = beam->states;
XTensor & idRef = beam->preID;
XTensor & modelScoreRef = beam->modelScore;
XTensor & probRef = beam->prob;
XTensor & probPathRef = beam->probPath;
XTensor & predictionRef = beam->prediction;
XTensor & endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
/* we copy the data to CPU because the frequent access to GPU is slow
and we can speed-up the process by doing the job on CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for(int i = 0; i < beam->stateNum; i += beamSize){
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState & state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
T2TState * last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Collect(T2TStateBundle * beam)
{
T2TState * states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* we push the hypothesis into the heap when it is completed */
if(state.isEnd != 0)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void T2TSearch::FillHeap(T2TStateBundle * beam)
{
bool * emptyFlags = new bool[batchSize];
for (int i = 0; i < batchSize; i++)
emptyFlags[i] = (fullHypos[i].Count() == 0);
T2TState * states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* we push the incomplete hypothesis into the heap */
if (emptyFlags[state.pid] && state.isEnd == 0)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
delete[] emptyFlags;
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
*/
void T2TSearch::Dump(XTensor * output)
{
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
output->SetDataFixed(-1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
/* for each output in the beam */
for(int i = 0; i < beamSize && heap.Count() > 0; i++){
T2TState * state = (T2TState *)heap.Pop().index;
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while(state != NULL){
if (!state->isCompleted)
isCompleted = false;
if (isCompleted)
words[count++] = -1;
else
words[count++] = state->prediction;
state = state->last;
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, beamSize - i - 1, w);
}
}
delete[] words;
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool T2TSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for(int i = 0; i < endSymbolNum; i++){
if(endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
{
if(endSymbols != NULL)
delete[] endSymbols;
if(tokenNum <= 0)
return;
/* we may have multiple end symbols */
endSymbols = new int[tokenNum];
for(int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
{
XTensor &prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensor(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9, i);
}
mask.SetDevice(prob.devID);
return mask;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
Dump(ofile, &output);
/* no gold likelihood is computed in translation mode;
the variables are kept only for the ppl report below */
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
/* report progress after every batch */
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed,wordCountTotal, exp(loss/wordCount));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
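/* a sketch of the resulting file layout (hypothetical word ids): one line
per hypothesis, each sequence closed by the padding value -1, e.g.
12 7 9 3 -1
12 8 3 -1 */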
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
strcpy(p, argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = atoi(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname)){
*p = true;
//fprintf(stderr, " %s=%s\n", name, "true");
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for(int i = 0; i < argc; i++){
if(!strcmp(argv[i], vname) && i + 1 < argc){
*p = (float)atof(argv[i + 1]);
//fprintf(stderr, " %s=%s\n", name, argv[i + 1]);
hit = true;
}
}
if(!hit)
*p = defaultP;
}
void ShowParams(int argc, char ** argv)
{
fprintf(stderr, "args:\n");
for(int i = 0; i < argc; i++){
if(argv[i][0] == 0 || argv[i][1] == 0)
continue;
if(argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')){
if(i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
}
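/* a minimal parsing sketch (hypothetical argument values): the helpers above
scan argv for "-name value" pairs and fall back to the given default when an
option is absent */
static void ExampleLoadParams()
{
const char* args[] = { "-nhead", "8", "-dropoutatt", "0.1" };
int nhead = 0;
float dropout = 0;
/* hits "-nhead 8", so nhead becomes 8 */
transformer::LoadParamInt(4, (char**)args, "nhead", &nhead, 8);
/* hits "-dropoutatt 0.1", so dropout becomes 0.1F */
transformer::LoadParamFloat(4, (char**)args, "dropoutatt", &dropout, 0);
/* "-fnnminmax" is absent, so the default 0.1F is used */
transformer::LoadParamFloat(4, (char**)args, "fnnminmax", &dropout, 0.1F);
}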
...@@ -17,99 +17,55 @@ ...@@ -17,99 +17,55 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#include <math.h> #include <cmath>
#include <time.h> #include <ctime>
#include "Transformer.h" #include "Transformer.h"
#include "T2TModel.h" #include "train/T2TTrainer.h"
#include "T2TUtility.h" #include "module/T2TUtility.h"
#include "T2TTrainer.h" #include "translate/T2TTranslator.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "../../tensor/XDevice.h" #include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h" #include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
namespace transformer namespace transformer
{ {
int TransformerMain(int argc, const char ** argv) int TransformerMain(int argc, const char** argv)
{ {
if(argc == 0) if (argc == 0)
return 1; return 1;
char ** args = new char*[argc]; /* load configurations */
for(int i = 0; i < argc; i++){ T2TConfig config(argc, argv);
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
tmpFILE = fopen("tmp.txt", "wb");
ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "train", trainFN, "");
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
srand((unsigned int)time(NULL)); srand((unsigned int)time(NULL));
T2TTrainer trainer; /* train the model */
trainer.Init(argc, args); if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model; T2TModel model;
model.InitModel(argc, args); model.InitModel(config);
T2TTrainer trainer;
/* learn model parameters */ trainer.Init(config);
if(strcmp(trainFN, "")) trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, "")){
/* beam search */
if(isBeamSearch){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
}
/* forced decoding */
else{
T2TTrainer tester;
tester.Init(argc, args);
tester.Validate(testFN, outputFN, &model);
}
} }
delete[] trainFN; /* translate the test file */
delete[] modelFN; if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
delete[] testFN; DISABLE_GRAD;
delete[] outputFN; T2TModel model;
model.InitModel(config);
for(int i = 0; i < argc; i++) T2TTranslator translator;
delete[] args[i]; translator.Init(config);
delete[] args; translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
fclose(tmpFILE); }
return 0; return 0;
} }
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,13 +17,13 @@ ...@@ -17,13 +17,13 @@
/* /*
* *
* An impelementation of the transformer system. See more details * An implementation of the transformer system. See more details
* about the Transformer in * about the Transformer in
* "Attention Is All You Need" by Vaswani et al. * "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf * https://arxiv.org/pdf/1706.03762.pdf
* *
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding * I start writing the code related to NMT - a long time since my last coding
* work on MT * work on MT
*/ */
...@@ -38,7 +38,7 @@ namespace transformer ...@@ -38,7 +38,7 @@ namespace transformer
{ {
/* entrance of the program */ /* entrance of the program */
int TransformerMain(int argc, const char ** argv); int TransformerMain(int argc, const char** argv);
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TAttention::T2TAttention()
{
nhead = -1;
dk = -1;
dv = -1;
d = -1;
}
/* de-constructor */
T2TAttention::~T2TAttention()
{
}
/*
initialize the model
>> config - the configurations of the network
*/
void T2TAttention::InitModel(T2TConfig& config)
{
devID = config.devID;
useRPR = config.useRPR;
nhead = config.nhead;
d = config.modelSize;
dk = config.modelSize;
dv = config.modelSize;
maxRP = config.maxRP;
dropoutP = config.attDropout;
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&wo, d, d, X_FLOAT, devID);
InitTensor1D(&bo, d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&wk, scale);
_SetDataFanInOut(&wq, scale);
_SetDataFanInOut(&wv, scale);
_SetDataFanInOut(&wo, scale);
if (useRPR)
_SetDataFanInOut(&RPEmbK, scale);
bk.SetZeroAll();
bq.SetZeroAll();
bv.SetZeroAll();
bo.SetZeroAll();
}
/*
make the network
>> k - keys, B * L * H for encoders, B * 1 * H for decoders
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries, B * L * H
>> v - values, B * L * H for encoders, B * 1 * H for decoders
>> mask - as it is
>> isTraining - indicates whether the model is used for training
>> cache - decoder cache
>> cacheType - type of cache, e.g., self-attention
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (cache == NULL);
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MulAndShift(q, wq, bq);
if (!cache || isTraining) {
/* self attention for encoder layers */
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
if (useRPR)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
else {
if (cacheType == SELF_ATT) {
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
k2 = Concatenate(cache->key, k2, 1);
v2 = Concatenate(cache->value, v2, 1);
}
cache->key = k2;
cache->value = v2;
cache->miss = false;
if (useRPR)
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MulAndShift(k, wk, bk);
cache->value = MulAndShift(v, wv, bv);
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
CheckNTErrors(0, "invalid cache type");
}
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* Some operations may cause numerical overflow under FP16 including
BMMul, Mask, Div and Softmax. So we need to cast the input to FP32 */
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
}
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask)
dot = dot + (*mask);
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
/*
make the attention network by incorporating the relative position representation
with the given keys, queries and values (after linear transformation)
>> k - keys, B * L * H
>> q - queries, B * L * H
>> v - values, B * L * H
>> mask - as it is
>> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder
*/
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batchSize = q.dimSize[0];
const int lenQ = q.dimSize[1];
const int lenKV = k.dimSize[1];
const auto dataType = k.dataType;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix);
if (qheads.dataType == X_FLOAT16) {
qheads = ConvertDataType(qheads, X_FLOAT);
kheads = ConvertDataType(kheads, X_FLOAT);
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
ScaleAndShiftMe(qheads, 1.0F / float(nhead));
dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask)
dot = dot + (*mask);
/* softmax */
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
if (vheads.dataType != scalar.dataType)
vheads = ConvertDataType(vheads, scalar.dataType);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
if (dataType != att.dataType)
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
/*
generate relative position embeddings
>> lenQ - the length of query
>> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position
>> isEnc - indicates whether it is used in the encoder (bidirectional indices)
*/
XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
{
XTensor range;
XTensor embMatrix;
InitTensor1D(&range, lenKV, X_INT, devID);
int* index = new int[lenKV];
if (isEnc) {
for (int i = 0; i < lenKV; i++)
index[i] = i;
range.SetData(index, lenKV);
XTensor range2D;
XTensor range2DTrans;
range2D = Unsqueeze(range, 0, lenQ);
range2DTrans = Transpose(range2D, 0, 1);
embMatrix = Sum(range2D, range2DTrans, -1);
}
else {
for (int i = 0; i < lenKV; i++)
index[i] = -lenKV + i + 1;
range.SetData(index, lenKV);
embMatrix = Unsqueeze(range, 0, lenQ);
}
ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index;
return embMatrix;
}
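/* a worked example of the matrix above: with lenQ = lenKV = 4 and
maxRelativeLen = 2, the encoder branch yields clip(j - i, +/-2) + 2 for
query position i and key position j:
    2 3 4 4
    1 2 3 4
    0 1 2 3
    0 0 1 2
each entry then selects a row of RPEmbK via Gather() */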
/*
Relative position-aware dot-product attention inner calculation.
>> x - Tensor with shape [batch_size*heads, length, length or depth].
>> y - Tensor with shape [batch_size*heads, length, depth].
>> z - Tensor with shape [length, length, depth].
>> isKey - Whether y is key.
<< return - A Tensor with shape [batch_size*heads, length, length or depth].
*/
XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{
const int headNum = nhead;
const int batchSize = x.dimSize[1];
const int lenQ = x.dimSize[2];
const int lenKV = y.dimSize[2];
const int depth = y.dimSize[3];
const int lastDim = isKey ? lenKV : depth;
MATRIX_TRANS_TYPE transposeFlag = isKey ? X_TRANS : X_NOTRANS;
XTensor context;
context = MatrixMulBatched(x, X_NOTRANS, y, transposeFlag);
int mergeDims[] = { headNum * batchSize, lenQ, x.dimSize[3] };
x.Reshape(3, mergeDims);
XTensor xTrans;
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans.Reshape(4, splitDims);
return Sum(context, relativeTrans);
}
/* constructor */
Cache::Cache()
{
miss = true;
}
/* update the states cache */
void Cache::Update(XTensor&& k, XTensor&& v)
{
key = k;
value = v;
miss = false;
}
/* keep alive states */
void Cache::KeepAlive(XTensor& aliveIdx)
{
if (!miss) {
key = AutoGather(key, aliveIdx);
value = AutoGather(value, aliveIdx);
}
}
/* reorder alive states */
void Cache::Reorder(XTensor& reorder)
{
if (!miss) {
key = AutoGather(key, reorder);
value = AutoGather(value, reorder);
}
}
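/* a minimal decoding-time sketch (hypothetical tensors and call sequence):
the first call misses the cache and stores the freshly computed K/V; the
second call hits and only appends the K/V of the newest token */
static XTensor ExampleCachedSelfAtt(T2TAttention& att, XTensor& prefix, XTensor& newTok)
{
Cache cache;    /* cache.miss == true */
/* step 0: K/V are computed from "prefix" and kept in the cache */
att.Make(prefix, prefix, prefix, NULL, false, &cache, SELF_ATT);
/* step 1: only the K/V of "newTok" are computed and concatenated
with the cached ones along the length dimension */
return att.Make(newTok, newTok, newTok, NULL, false, &cache, SELF_ATT);
}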
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,48 +17,93 @@ ...@@ -17,48 +17,93 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TATTENTION_H__ #ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__ #define __T2TATTENTION_H__
#include "../../network/XNet.h" #include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace transformer
{ {
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
/* /* layer cache for keys and values */
multi-head attention class Cache
y(Q, K, V) = cat(head_1, head_2, ..., head_n) {
where head_i = Attention(Q * w_i^Q, K * w_i^K, V * w_i^V) public:
attention(Q, K, V) = softmax(Q * K^T/d_k^0.5) V /* cache for keys, (B, L, H) */
d_k = dimension size of K XTensor key;
*/
/* cache for values, (B, L, H) */
XTensor value;
public:
/* indicates cache miss if 'true' */
bool miss;
/* constructor */
Cache();
/* update the states cache */
void Update(XTensor&& k, XTensor&& v);
/* keep alive states */
void KeepAlive(XTensor& aliveIdx);
/* reorder alive states */
void Reorder(XTensor& reorder);
};
/* multi-head attention */
class T2TAttention class T2TAttention
{ {
public: public:
/* device id */ /* device id */
int devID; int devID;
/* head number */ /* head number */
int nhead; int nhead;
/* transformation matrix for Q */
XTensor wq;
/* bias for Q */
XTensor bq;
/* transformation matrix for K */ /* transformation matrix for K */
XTensor wk; XTensor wk;
/* transformation matrix for Q */ /* bias for K */
XTensor wq; XTensor bk;
/* transformation matrix for V */ /* transformation matrix for V */
XTensor wv; XTensor wv;
/* bias for V */
XTensor bv;
/* fused transformation matrix for Q, K and V */
XTensor wBig;
/* fused bias for Q, K and V */
XTensor bBig;
/* RPR emb */
XTensor RPEmbK;
/* transformation after dot-product attention */ /* transformation after dot-product attention */
XTensor wa; XTensor wo;
XTensor wbig; /* bias after dot-product attention */
XTensor bo;
/* size of transformed Q and K */ /* size of transformed Q and K */
int dk; int dk;
...@@ -68,19 +113,15 @@ public: ...@@ -68,19 +113,15 @@ public:
/* size of input Q, K and V */ /* size of input Q, K and V */
int d; int d;
/* indicates whether the attention is masked */ /* indicates whether we use the RPR attention */
bool isMasked; bool useRPR;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
special design for the attention model. */
int ignored;
/* indicates whether the model is used for training */
bool isTraining;
/* dropout probability */ /* dropout probability */
DTYPE dropoutP; DTYPE dropoutP;
/* the maximum relative window size */
int maxRP;
public: public:
/* constructor */ /* constructor */
T2TAttention(); T2TAttention();
...@@ -89,20 +130,25 @@ public: ...@@ -89,20 +130,25 @@ public:
~T2TAttention(); ~T2TAttention();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, void InitModel(T2TConfig& config);
bool myIsMasked, int myIgnored,
int myDevID = -1);
/* make the network */ /* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining); XTensor Make(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining,
/* make the network given a big tensor that keeps keys, queries and values */ Cache* cache, int cacheType);
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */ /* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining); XTensor MakeAttention(XTensor& k, XTensor& q, XTensor& v,
}; XTensor* mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
} }
#endif #endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-05
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/*
flexible layer normalization for the Transformer
>> input - input tensor
>> ln - the layernorm network
>> prenorm - whether we use prenorm or not
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
else
return input;
}
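/* the XOR above encodes the usual pre-/post-norm convention: under prenorm
the normalization fires on the "before" call (after == false) and under
postnorm on the "after" call (after == true); the other call of each pair
returns the input untouched, which is why "before" itself never needs to
be read */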
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
using namespace nts;
namespace transformer
{
/* the layer normalization module to control pre-norm or post-norm */
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TEmbedder::T2TEmbedder()
{
devID = -1;
vSize = -1;
maxLength = -1;
}
/* de-constructor */
T2TEmbedder::~T2TEmbedder()
{
}
/*
initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
padIdx = config.padID;
eSize = config.embSize;
maxLength = config.maxPosLen;
vSize = (isEnc) ? config.srcVocabSize : config.tgtVocabSize;
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
/* two extra positions are reserved beyond the configured maximum
(cf. "embedder.maxLength - 1 - 1" when the model header is dumped) */
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F / (float)sqrt((float)eSize);
w.SetDataRandn(0, v);
/* create the positional embedding matrix */
MakePosEmbedding(maxLength);
}
/*
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float* data = new float[posEmbeddingBase.unitNum];
for (int pos = 0; pos < length; pos++) {
float* dp = data + pos * eSize;
int channelSize = eSize / 2;
int offset = 0;
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)sin(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
for (int i = 0; i < channelSize; i++) {
dp[offset++] = (float)cos(pos * exp(-i * log(10000.0F) / (channelSize - 1)));
}
}
/* padding zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
if (w.dataType != posEmbeddingBase.dataType)
posEmbeddingBase = ConvertDataType(posEmbeddingBase, w.dataType);
delete[] data;
}
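/* written as a formula, the table above is the "concatenated" variant of
the sinusoidal encoding with c = eSize / 2 channels per half:
PE(pos, i)     = sin(pos / 10000^(i / (c - 1)))   for 0 <= i < c
PE(pos, c + i) = cos(pos / 10000^(i / (c - 1)))
i.e. all sine channels first, then all cosine channels, with the row of
the padding index zeroed out */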
/*
make the network
>> input - the word indices
>> nstep - the length of current sequence
>> isDec - indicates whether it is decoder
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* note: the position logic below assumes that the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
if (!isDec)
{
/* encoder embeddings */
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
}
else
{
/* decoder embeddings */
position.SetDataFixed(nstep + 2);
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
/* then we make word embeddings */
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
}
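/* a worked instance of the position logic above (hypothetical ids): for the
encoder row [w  w  <pad>  <pad>] with the padding id 1, the generated
position indices are [2  3  1  1]; real tokens count up from 2 while every
padding token selects the zeroed row padIdx of the positional table */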
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,12 +17,14 @@ ...@@ -17,12 +17,14 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-01
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/ */
#ifndef __T2TEMBEDDING_H__ #ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__ #define __T2TEMBEDDING_H__
#include "../../network/XNet.h" #include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts; using namespace nts;
...@@ -31,7 +33,7 @@ namespace transformer ...@@ -31,7 +33,7 @@ namespace transformer
#define DEFAULT_EMBEDDING_SIZE 512 #define DEFAULT_EMBEDDING_SIZE 512
/* /*
embedding (of word at position i): embedding (of word at position i):
word embedding + positional embedding word embedding + positional embedding
*/ */
...@@ -40,7 +42,7 @@ class T2TEmbedder ...@@ -40,7 +42,7 @@ class T2TEmbedder
public: public:
/* device id */ /* device id */
int devID; int devID;
/* vocabulary size */ /* vocabulary size */
int vSize; int vSize;
...@@ -53,10 +55,13 @@ public: ...@@ -53,10 +55,13 @@ public:
/* dimension size of the hidden layers in the t2t model */ /* dimension size of the hidden layers in the t2t model */
int d; int d;
/* padding index */
int padIdx;
/* word embedding matrix */ /* word embedding matrix */
XTensor w; XTensor w;
/* predefined positional embeddings. It can speed up /* predefined positional embeddings. It can speed up
the embedding processing by re-loading. */ the embedding processing by re-loading. */
XTensor posEmbeddingBase; XTensor posEmbeddingBase;
...@@ -68,13 +73,13 @@ public: ...@@ -68,13 +73,13 @@ public:
~T2TEmbedder(); ~T2TEmbedder();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true); void InitModel(T2TConfig& config, bool isEnc = true);
/* make positional embeddings */ /* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length); void MakePosEmbedding(int length);
/* make the network */ /* make the network */
XTensor Make(XTensor &input); XTensor Make(XTensor& input, bool isDec, bool isTraining, int nstep = 0);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,14 +17,16 @@ ...@@ -17,14 +17,16 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <math.h> #include <cmath>
#include "T2TFNN.h" #include "T2TFNN.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
namespace transformer namespace transformer
{ {
...@@ -32,33 +34,30 @@ namespace transformer ...@@ -32,33 +34,30 @@ namespace transformer
/* constructor */ /* constructor */
T2TFNN::T2TFNN() T2TFNN::T2TFNN()
{ {
inSize = -1; inSize = -1;
outSize = -1; outSize = -1;
hSize = -1; hSize = -1;
} }
/* deconstructor */ /* de-constructor */
T2TFNN::~T2TFNN() T2TFNN::~T2TFNN()
{ {
} }
/* /*
initialize the model initialize the model
>> argc - number of arguments >> argc - number of arguments
>> argv - list of pointers to the arguments >> argv - list of pointers to the arguments
>> myDevID - device id >> config - configurations of the model
*/ */
void T2TFNN::InitModel(int argc, char ** argv, int myDevID) void T2TFNN::InitModel(T2TConfig& config)
{ {
devID = myDevID; devID = config.devID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE); inSize = config.modelSize;
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE); outSize = config.modelSize;
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4); hSize = config.fnnHiddenSize;
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F); dropoutP = config.fnnDropout;
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID); InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID); InitTensor1D(&b1, hSize, X_FLOAT, devID);
...@@ -74,27 +73,24 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID) ...@@ -74,27 +73,24 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
b2.SetZeroAll(); b2.SetZeroAll();
} }
/* /*
make the network make the network
y = max(0, x * w1 + b1) * w2 + b2 y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor >> input - the input tensor
>> return - the output tensor >> return - the output tensor
*/ */
XTensor T2TFNN::Make(XTensor &input, bool isTraining) XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{ {
XTensor t1; XTensor t1;
/* t1 = max(0, x * w1 + b1) */ /* t1 = max(0, x * w1 + b1) */
//t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1)); t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0) if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP); t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */ /* result = t1 * w2 + b2 */
//return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2); return MulAndShift(t1, w2, b2);
} }
}
} \ No newline at end of file
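A minimal usage sketch of the position-wise FNN above (illustrative only; "config" and "x" are assumed to be a prepared T2TConfig and a B * L * d input, not names from this commit):

    #include "T2TFNN.h"
    using namespace transformer;

    void FNNExample(T2TConfig& config, XTensor& x)
    {
        T2TFNN fnn;
        fnn.InitModel(config);

        /* y = max(0, x * w1 + b1) * w2 + b2, with dropout applied inside when training */
        XTensor y = fnn.Make(x, /*isTraining=*/true);
    }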
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,12 +17,15 @@ ...@@ -17,12 +17,15 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TFNN_H__ #ifndef __T2TFNN_H__
#define __T2TFNN_H__ #define __T2TFNN_H__
#include "../../tensor/XTensor.h" #include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../../tensor/XTensor.h"
using namespace nts; using namespace nts;
...@@ -56,7 +59,7 @@ public: ...@@ -56,7 +59,7 @@ public:
/* bias of transformation 2 */ /* bias of transformation 2 */
XTensor b2; XTensor b2;
/* dropout probability */ /* dropout probability */
DTYPE dropoutP; DTYPE dropoutP;
...@@ -65,15 +68,14 @@ public: ...@@ -65,15 +68,14 @@ public:
/* constructor */ /* constructor */
T2TFNN(); T2TFNN();
/* deconstructor */ /* de-constructor */
~T2TFNN(); ~T2TFNN();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1); void InitModel(T2TConfig& config);
/* make the network */ /* make the network */
XTensor Make(XTensor &input, bool isTraining); XTensor Make(XTensor& input, bool isTraining);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
{
/* constructor */
GLU::GLU()
{
inSize = -1;
outSize = -1;
hSize = -1;
}
/* de-constructor */
GLU::~GLU()
{
}
/*
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
{
devID = config.devID;
inSize = config.modelSize;
outSize = config.modelSize;
/* assumption: the original code never set hSize; each half of the input has dimension modelSize */
hSize = config.modelSize;
InitTensor2D(&w1, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b1, outSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
}
/*
make the network
y = (W1 * x + b1) * sigmoid(W2 * x + b2)
>> input - the input tensor, size = 2 * hSize
>> return - the output tensor, size = hSize
*/
XTensor GLU::Make(XTensor& input)
{
XTensor t1;
XTensor t2;
TensorList input_list;
/* split the input along the last dim into two parts, each of dimension hSize */
Split(input, input_list, -1, 2);
/* t1 = W1 * x + b1 */
t1 = MulAndShift(input_list.GetItem(0), w1, b1);
/* t2 = W2 * x + b2 */
t2 = MulAndShift(input_list.GetItem(1), w2, b2);
return t1 * Sigmoid(t2);
}
}
\ No newline at end of file
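A hedged sketch of driving the gated linear unit above; the input's last dimension is assumed to be 2 * modelSize so that Split can halve it:

    #include "T2TGatedLinearUnit.h"
    using namespace transformer;

    void GLUExample(T2TConfig& config, XTensor& x2d)
    {
        GLU glu;
        glu.InitModel(config);

        /* y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2), where x1/x2 are the two halves of x2d */
        XTensor y = glu.Make(x2d);
    }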
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
using namespace nts;
namespace transformer
{
/* a gated linear unit: y = (x1 * w1 + b1) * sigmoid(x2 * w2 + b2) */
class GLU
{
public:
/* device id */
int devID;
/* size of input vector */
int inSize;
/* size of output vector */
int outSize;
/* size of hidden layers */
int hSize;
/* matrix of transformation 1 */
XTensor w1;
/* bias of transformation 1 */
XTensor b1;
/* matrix of transformation 2 */
XTensor w2;
/* bias of transformation 2 */
XTensor b2;
public:
/* constructor */
GLU();
/* de-constructor */
~GLU();
/* initialize the model */
void InitModel(T2TConfig& config);
/* make the network */
XTensor Make(XTensor& input);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
{
/* constructor */
LayerHistory::LayerHistory()
{
d = -1;
count = 0;
weight = NULL;
layerNorms = NULL;
}
/* de-constructor */
LayerHistory::~LayerHistory()
{
ClearHistory();
delete[] layerNorms;
}
/*
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
{
devID = config.devID;
d = config.modelSize;
nlayer = config.nEncLayer;
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
layerNorms[i].InitModel(config);
}
}
/*
the Add operation
>> tensor - the previous layer output. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
*/
void LayerHistory::Add(XTensor& tensor)
{
/* the embedding (the first entry) is stored without layer normalization */
count += 1;
if (history.Size() == 0) {
history.Add(new XTensor(tensor));
return;
}
/* keep a heap copy so that the normed tensor outlives this call */
XTensor* ln = new XTensor(layerNorms[count - 2].Make(tensor));
history.Add(ln);
}
/*
generate the weighted sum of all previous layer outputs in the history, used as the input of the current layer
*/
XTensor LayerHistory::Pop()
{
/* the number of layer outputs in the history */
size_t size = history.Size();
TensorList historyList;
for (size_t i = 0; i < size; i++)
historyList.Add(history[i]);
/* we need to stack the tensors along the first dim */
XTensor stackTensor = Stack(historyList, 0);
XTensor interWeight;
InitTensor2D(&interWeight, 1, weight.dimSize[1], DEFAULT_DTYPE, devID);
XTensor layerWeight;
InitTensor1D(&layerWeight, size, DEFAULT_DTYPE, devID);
_SelectRange(&weight, &interWeight, 0, size - 1, size);
interWeight.Reshape(interWeight.unitNum);
_SelectRange(&interWeight, &layerWeight, 0, 0, size);
MultiplyDimMe(stackTensor, layerWeight, 0);
XTensor result;
ReduceSum(stackTensor, result, 0);
return result;
}
void LayerHistory::ClearHistory()
{
/* free the stored copies and reset the layer counter */
for (int i = 0; i < (int)history.Size(); i++)
delete history[i];
history.Clear();
count = 0;
}
}
\ No newline at end of file
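A hedged sketch of the intended Add/Pop cycle in a DLCL encoder; "embedding" and "MakeLayer" are illustrative stand-ins, not names from this commit:

    /* sketch: dynamic linear combination of layers with LayerHistory */
    LayerHistory history;
    history.InitModel(config);

    history.ClearHistory();
    history.Add(embedding);                 /* entry 0: the un-normed embedding */

    for (int l = 0; l < config.nEncLayer; l++) {
        XTensor input = history.Pop();      /* weighted sum of all stored outputs */
        XTensor output = MakeLayer(l, input);
        history.Add(output);                /* normed and stored for later layers */
    }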
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
{
/*
layer history for the dynamic linear combination of layers (DLCL).
It stores the output of every previous layer and generates the input
of the next layer as a learnable weighted sum of the history.
*/
class LayerHistory
{
public:
/* device id */
int devID;
/* the triangular weight matrix for DLCL */
XTensor weight;
/* hidden size */
int d;
/* layer number */
int nlayer;
/* current layer number */
int count;
/* a history to store the output of intermediate layers */
TensorList history;
/* layer normalization for each intermediate layer */
T2TLN* layerNorms;
public:
/* constructor */
LayerHistory();
/* de-constructor */
~LayerHistory();
/* initialize the model */
void InitModel(T2TConfig& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
/* compute the input of the current layer, i.e., the weighted sum of all normed previous layer outputs in the history */
XTensor Pop();
/* clear the history */
void ClearHistory();
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,13 +17,14 @@ ...@@ -17,13 +17,14 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <math.h> #include <cmath>
#include "T2TLayerNormal.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h" #include "T2TLayerNormal.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer namespace transformer
{ {
...@@ -44,32 +45,28 @@ T2TLN::~T2TLN() ...@@ -44,32 +45,28 @@ T2TLN::~T2TLN()
initialize the model initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id >> config - configurations of the model
*/ */
void T2TLN::InitModel(int argc, char ** argv, int myDevID) void T2TLN::InitModel(T2TConfig& config)
{ {
devID = myDevID; devID = config.devID;
d = 0; d = config.modelSize;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID); InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID); InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F); w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll(); b.SetZeroAll();
} }
/* /*
make the network make the network
for each layer representation x, we have
y =
>> input - the input tensor >> input - the input tensor
>> return - layer normalization output >> return - layer normalization output
*/ */
XTensor T2TLN::Make(XTensor &input) XTensor T2TLN::Make(XTensor& input)
{ {
XTensor &x = input; XTensor& x = input;
XTensor xn; XTensor xn;
XTensor mean; XTensor mean;
XTensor variance; XTensor variance;
...@@ -77,6 +74,13 @@ XTensor T2TLN::Make(XTensor &input) ...@@ -77,6 +74,13 @@ XTensor T2TLN::Make(XTensor &input)
XTensor meanFilled; XTensor meanFilled;
XTensor standardFilled; XTensor standardFilled;
TENSOR_DATA_TYPE dataType = input.dataType;
if (dataType == X_FLOAT16) {
/* reduce functions can only run with FP32 */
x = ConvertDataType(input, X_FLOAT);
}
/* \mu = (sum_i x_i)/m */ /* \mu = (sum_i x_i)/m */
mean = ReduceMean(x, x.order - 1); mean = ReduceMean(x, x.order - 1);
...@@ -94,8 +98,13 @@ XTensor T2TLN::Make(XTensor &input) ...@@ -94,8 +98,13 @@ XTensor T2TLN::Make(XTensor &input)
/* x' = (x - \mu)/standard */ /* x' = (x - \mu)/standard */
xn = (x - meanFilled) / standardFilled; xn = (x - meanFilled) / standardFilled;
if (dataType != mean.dataType) {
x = ConvertDataType(x, dataType);
xn = ConvertDataType(xn, dataType);
}
/* result = x' * w + b */ /* result = x' * w + b */
return xn * w + b; return xn * w + b;
} }
} }
\ No newline at end of file
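A short usage sketch for the layer normalization above (illustrative; "config" and "x" are assumptions):

    T2TLN layerNorm;
    layerNorm.InitModel(config);

    /* y = (x - mean) / standard * w + b, normalized over the last dim of x */
    XTensor y = layerNorm.Make(x);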
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,19 +17,21 @@ ...@@ -17,19 +17,21 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TLAYERNORMAL_H__ #ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__ #define __T2TLAYERNORMAL_H__
#include "../../network/XNet.h" #include "T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts; using namespace nts;
namespace transformer namespace transformer
{ {
/* layer normalization: y = norm(x) * w + b /* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */ where norm(x) = (x - mean)/standardDeviation */
class T2TLN class T2TLN
{ {
...@@ -45,19 +47,19 @@ public: ...@@ -45,19 +47,19 @@ public:
/* dimension size of the model */ /* dimension size of the model */
int d; int d;
public: public:
/* constructor */ /* constructor */
T2TLN(); T2TLN();
/* de-constructor */ /* de-constructor */
~T2TLN(); ~T2TLN();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1); void InitModel(T2TConfig& config);
/* make the network */ /* make the network */
XTensor Make(XTensor &input); XTensor Make(XTensor& input);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
namespace transformer
{
/*
a wrapper for the gather function
>> src - the input tensor
>> index - the index tensor
<< res - the output tensor
*/
XTensor AutoGather(XTensor& src, XTensor& index)
{
if (src.order == 2)
return Gather(src, index);
else {
CheckNTErrors(src.order == 3, "the source must be 3d");
int order = src.order;
int dimSize[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < src.order; i++) {
dimSize[i] = src.dimSize[i];
}
src.Reshape(src.dimSize[0], src.dimSize[1] * src.dimSize[2]);
XTensor res = Gather(src, index);
src.Reshape(order, dimSize);
dimSize[0] = index.dimSize[0];
dimSize[1] = res.unitNum / (dimSize[0] * dimSize[2]);
res.Reshape(order, dimSize);
return res;
}
}
}
\ No newline at end of file
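A hedged shape example for the 3-d branch of AutoGather (tensor names and sizes are illustrative):

    /* src:   (4, 5, 8) - viewed as (4, 40) for the 2-d gather
       index: a 1-d tensor of row ids into that view
       res:   gathered rows, reshaped back to (indexSize, 5, 8) */
    XTensor res = AutoGather(src, index);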
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -16,31 +16,24 @@ ...@@ -16,31 +16,24 @@
*/ */
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
*/ */
#ifndef __T2TUTILITY_H__ #ifndef __T2TNNUTIL_H__
#define __T2TUTILITY_H__ #define __T2TNNUTIL_H__
#include <stdio.h> #include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer namespace transformer
{ {
extern FILE * tmpFILE; /* the gather function for tensor with any dimension */
XTensor AutoGather(XTensor& src, XTensor& index);
/* load arguments */
void LoadParamString(int argc, char ** argv, const char * name, char * p, const char * defaultP);
void LoadParamInt(int argc, char ** argv, const char * name, int * p, int defaultP);
void LoadParamBool(int argc, char ** argv, const char * name, bool * p, bool defaultP);
void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float defaultP);
/* show arguments */
void ShowParams(int argc, char ** argv);
extern int llnum;
extern FILE * tf;
} }
#endif #endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,22 +17,24 @@ ...@@ -17,22 +17,24 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <math.h> #include <cmath>
#include "T2TOutput.h" #include "T2TOutput.h"
#include "T2TUtility.h" #include "T2TUtility.h"
#include "T2TEmbedding.h" #include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
namespace transformer namespace transformer
{ {
/* constructor */ /* constructor */
T2TOutput::T2TOutput() T2TOutput::T2TOutput()
{ {
devID = -1; devID = -1;
vSize = -1; vSize = -1;
inSize = -1;
hSize = -1; hSize = -1;
} }
...@@ -42,57 +44,51 @@ T2TOutput::~T2TOutput() ...@@ -42,57 +44,51 @@ T2TOutput::~T2TOutput()
} }
/* /*
initialize the model initialize the model
>> argc - number of arguments >> config - configurations of the model
>> argv - list of pointers to the arguments
>> myDevID - device id
*/ */
void T2TOutput::InitModel(int argc, char ** argv, int myDevID) void T2TOutput::InitModel(T2TConfig& config)
{ {
devID = myDevID; devID = config.devID;
hSize = config.modelSize;
float minmax = 0; vSize = config.tgtVocabSize;
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1); InitTensor2D(&w, vSize, hSize, X_FLOAT, devID);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID); DTYPE v = 1.0F / (float)sqrt((float)hSize);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
w.SetDataRand(-finfout, finfout);
DTYPE v = 1.0F/(float)sqrt((float)hSize);
w.SetDataRandn(0, v); w.SetDataRandn(0, v);
} }
/* /*
make the network make the network (redefined output tensor)
y = softmax(x * w)
>> input - input tensor >> input - input tensor
<< return - output tensor >> output - output tensor
>> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax
*/ */
XTensor T2TOutput::Make(XTensor &input) void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{ {
XTensor &x = input; XTensor& x = input;
return LogSoftmax(MMul(x, w), -1); output = MMul(x, X_NOTRANS, w, X_TRANS);
}
/* /* use softmax for training */
make the network (redefined output tensor) if (isTraining) {
>> input - input tensor output = Softmax(output, -1);
>> output - output tensor return;
*/ }
void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1); /* normalize the output for beam search */
output = Softmax(MMul(x, w), -1); if (normalized) {
output.SetName(OUTPUT_NAME); auto dataType = output.dataType;
} if (dataType == X_FLOAT16)
output = ConvertDataType(output, X_FLOAT);
output = LogSoftmax(output, -1);
if (output.dataType != dataType)
output = ConvertDataType(output, dataType);
}
} }
}
\ No newline at end of file
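A hedged sketch of the two call modes of the redefined Make; "outputLayer" and "hidden" are illustrative:

    XTensor output;

    /* training: probabilities via Softmax */
    outputLayer.Make(hidden, output, /*isTraining=*/true, /*normalized=*/false);

    /* beam search: log-probabilities for score accumulation */
    outputLayer.Make(hidden, output, /*isTraining=*/false, /*normalized=*/true);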
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,19 +17,19 @@ ...@@ -17,19 +17,19 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TOUTPUT_H__ #ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__ #define __T2TOUTPUT_H__
#include "../../tensor/function/FHeader.h" #include "T2TUtility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
namespace transformer namespace transformer
{ {
#define OUTPUT_NAME "output"
/* output layer */ /* output layer */
class T2TOutput class T2TOutput
...@@ -41,9 +41,6 @@ public: ...@@ -41,9 +41,6 @@ public:
/* vocabulary size */ /* vocabulary size */
int vSize; int vSize;
/* input vector size */
int inSize;
/* vector size of the linear transformation */ /* vector size of the linear transformation */
int hSize; int hSize;
...@@ -58,16 +55,12 @@ public: ...@@ -58,16 +55,12 @@ public:
~T2TOutput(); ~T2TOutput();
/* initialize the model */ /* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1); void InitModel(T2TConfig& config);
/* make the network */
XTensor Make(XTensor &input);
/* make the network (redefined output tensor) */ /* make the network (redefined output tensor) */
void Make(XTensor &input, XTensor &output); void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
}; };
} }
#endif #endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
{
/*
load configurations from the command line
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
args[i] = new char[strlen(argv[i]) + 1];
strcpy(args[i], argv[i]);
}
char* configFN = new char[1024];
LoadParamString(argc, args, "config", configFN, "");
int argsNum = argc;
/* load configurations from a file */
if (strcmp(configFN, "") != 0)
argsNum = LoadFromFile(configFN, args);
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
/* options for training */
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
isTraining = strcmp(trainFN, "") != 0;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1F);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0F);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0F);
/* use argsNum rather than argc so that options loaded from a config file are seen here as well */
LoadParamFloat(argsNum, args, "lrate", &lrate, 1.0F);
LoadParamFloat(argsNum, args, "lrbias", &lrbias, 0);
LoadParamInt(argsNum, args, "nepoch", &nepoch, 20);
LoadParamInt(argsNum, args, "nstep", &nstep, 100000);
LoadParamInt(argsNum, args, "nwarmup", &nwarmup, 3000);
LoadParamBool(argsNum, args, "adam", &useAdam, true);
LoadParamFloat(argsNum, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argsNum, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argsNum, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argsNum, args, "shuffled", &isShuffled, true);
LoadParamFloat(argsNum, args, "labelsmoothing", &labelSmoothingP, 0.1F);
LoadParamInt(argsNum, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argsNum, args, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argsNum, args, "updatestep", &updateStep, 1);
LoadParamBool(argsNum, args, "debug", &isDebugged, false);
LoadParamBool(argsNum, args, "sorted", &isLenSorted, false);
LoadParamInt(argsNum, args, "bufsize", &bufSize, 50000);
LoadParamBool(argsNum, args, "doubledend", &isDoubledEnd, false);
LoadParamBool(argsNum, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argsNum, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argsNum, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argsNum, args, "bucketsize", &bucketSize, 0);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
/* free every pointer currently stored in args (command-line copies and any file-loaded options) */
for (int i = 0; i < (argsNum > argc ? argsNum : argc); i++)
delete[] args[i];
delete[] args;
delete[] configFN;
}
/*
load configurations from a file
>> configFN - path to the configuration file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
int argsNum = 0;
/* parse arguments */
string key, value;
while (f >> key >> value) {
/* LoadParam* expects options in the "-name" form */
key = "-" + key;
args[argsNum] = new char[key.size() + 1];
strcpy(args[argsNum++], key.c_str());
args[argsNum] = new char[value.size() + 1];
strcpy(args[argsNum++], value.c_str());
}
/* record the number of arguments */
return argsNum;
}
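Per the format note above, a hypothetical config file could read as follows (keys mirror the options parsed in the constructor; the values are examples only):

    dev 0
    nhead 8
    enclayer 6
    declayer 6
    embsize 512
    modelsize 512
    vsize 32000
    vsizetgt 32000
    train train.data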
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
split a string by a delimiter; this returns the start indices of all sub-strings
>> s - the original string
>> delimiter - as it is
<< indices - start indices of all sub-strings
*/
UInt64List SplitToPos(const string& s, const string& delimiter)
{
UInt64List indices;
/* an empty delimiter means the whole string is one token */
if (delimiter.length() == 0) {
indices.Add(0);
return indices;
}
size_t pos = 0;
uint64_t start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
indices.Add(start);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
indices.Add(start);
}
return indices;
}
/* split a string into an integer list */
IntList SplitInt(const string& s, const string& delimiter)
{
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
/* split a string into a float list */
FloatList SplitFloat(const string& s, const string& delimiter)
{
FloatList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtof(s.data() + indices[i], nullptr));
}
return values;
}
}
\ No newline at end of file
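Two hedged one-liners showing the splitters above (input strings are illustrative):

    IntList ids = SplitInt("1 25 300", " ");      /* -> {1, 25, 300} */
    FloatList ws = SplitFloat("0.5,0.25", ",");   /* -> {0.5F, 0.25F} */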
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
public:
/* path to the model */
char modelFN[1024];
/* path to the source vocab */
char srcVocabFN[1024];
/* path to the target vocab */
char tgtVocabFN[1024];
/* path to the input file (for inference) */
char testFN[1024];
/* path to the output file (for inference) */
char outputFN[1024];
/* path to the training file */
char trainFN[1024];
/* path to the validation file */
char validFN[1024];
/* device id */
int devID;
/* beam size */
int beamSize;
/* word batch size */
int wBatchSize;
/* sentence batch size */
int sBatchSize;
/* number of heads in attention */
int nhead;
/* number of encoder layers */
int nEncLayer;
/* number of decoder layers */
int nDecLayer;
/* the maximum relative position in RPR attentions */
int maxRP;
/* the dimension of embeddings */
int embSize;
/* the dimension of hidden layer */
int modelSize;
/* the maximum length in positional embedding */
int maxPosLen;
/* the dimension of fnn hidden layer */
int fnnHiddenSize;
/* the vocab size of source sequence */
int srcVocabSize;
/* the vocab size of target sequence */
int tgtVocabSize;
/* the padding id */
int padID;
/* start symbol */
int startID;
/* end symbol */
int endID;
/* indicates whether the model uses pre-norm */
bool preNorm;
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
/* indicates whether we use the RPR attention */
bool useRPR;
/* indicates whether we train the model */
bool isTraining;
/* dropout rate for the model */
float dropout;
/* dropout rate for fnn layers */
float fnnDropout;
/* dropout rate for attention layers */
float attDropout;
/* the alpha parameter controls the length preference */
float lenAlpha;
/* scale factor of the input length (for the max number of search steps) */
float maxLenAlpha;
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* training epoch number */
int nepoch;
/* training step number */
int nstep;
/* indicates whether we use Adam */
bool useAdam;
/* hyper parameters of Adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
/* step number of warm-up for training */
int nwarmup;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
/* the factor of label smoothing */
float labelSmoothingP;
/* number of steps after which we make a checkpoint */
int nStepCheckpoint;
/* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint;
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* buffer size */
int bufSize;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* load configurations from the command */
T2TConfig(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
};
}
#endif
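A hedged sketch of constructing the configuration at program start ("main" here is illustrative, not the sample's real entry point):

    /* parse options from the command line and/or a "-config" file */
    int main(int argc, const char** argv)
    {
        /* e.g., ./app -dev 0 -nhead 8  or  ./app -config mt.cfg */
        transformer::T2TConfig config(argc, argv);
        return 0;
    }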
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -20,10 +20,10 @@ ...@@ -20,10 +20,10 @@
*/ */
#include "T2TBatchLoader.h" #include "T2TBatchLoader.h"
#include "T2TUtility.h" #include "../module/T2TUtility.h"
#include "../../tensor/XUtility.h" #include "../../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../network/XNoder.h" #include "../../../network/XNoder.h"
namespace transformer namespace transformer
{ {
...@@ -55,24 +55,24 @@ T2TBatchLoader::~T2TBatchLoader() ...@@ -55,24 +55,24 @@ T2TBatchLoader::~T2TBatchLoader()
delete[] seqOffset; delete[] seqOffset;
} }
/* /*
initialization initialization
>> argc - number of arguments >> config - configurations of the model
>> argv - list of pointers to the arguments
*/ */
void T2TBatchLoader::Init(int argc, char ** argv) void T2TBatchLoader::Init(T2TConfig& config)
{ {
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000); bufSize = config.bufSize;
LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false); isDoubledEnd = config.isDoubledEnd;
LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true); isSmallBatch = config.isSmallBatch;
LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false); isBigBatch = config.isBigBatch;
LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false); isRandomBatch = config.isRandomBatch;
LoadParamInt(argc, argv, "bucketsize", &bucketSize, 0); bucketSize = config.bucketSize;
buf = new int[bufSize]; buf = new int[bufSize];
buf2 = new int[bufSize]; buf2 = new int[bufSize];
bufBatch = new BatchNode[bufSize]; bufBatch = new BatchNode[bufSize];
seqLen = new int[bufSize]; seqLen = new int[bufSize];
seqLen2 = new int[bufSize]; seqLen2 = new int[bufSize];
seqOffset = new int[bufSize]; seqOffset = new int[bufSize];
} }
...@@ -83,65 +83,65 @@ struct SampleNode ...@@ -83,65 +83,65 @@ struct SampleNode
{ {
int id; int id;
int offset; int offset;
int * p; int* p;
int size; int size;
int value; int value;
int key; int key;
}; };
int CompareSampleNode(const void * a, const void * b) int CompareSampleNode(const void* a, const void* b)
{ {
return ((SampleNode*)b)->value - ((SampleNode*)a)->value; return ((SampleNode*)b)->value - ((SampleNode*)a)->value;
} }
int CompareSampleNodeV2(const void * a, const void * b) int CompareSampleNodeV2(const void* a, const void* b)
{ {
return ((SampleNode*)b)->key - ((SampleNode*)a)->key; return ((SampleNode*)b)->key - ((SampleNode*)a)->key;
} }
/* /*
load data to buffer load data to buffer
>> file - where to load data >> file - where to load data
>> isSorted - indicates whether the samples are sorted by length >> isSorted - indicates whether the samples are sorted by length
>> step - the number of sequences we go over when moving to the next sample >> step - the number of sequences we go over when moving to the next sample
*/ */
int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) int T2TBatchLoader::LoadBuf(FILE* file, bool isSorted, int step)
{ {
int lineCount = 0; int lineCount = 0;
int seqCount = 0; int seqCount = 0;
int wordCount = 0; int wordCount = 0;
while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){ while (fgets(line, MAX_SEQUENCE_LENGTH - 1, file)) {
int len = (int)strlen(line); int len = (int)strlen(line);
while(line[len - 1] == '\r' || line[len - 1] == '\n'){ while (line[len - 1] == '\r' || line[len - 1] == '\n') {
line[len - 1] = 0; line[len - 1] = 0;
len--; len--;
} }
len = (int)strlen(line); len = (int)strlen(line);
if(len == 0) if (len == 0)
continue; continue;
/* how many characters are in a word */ /* how many characters are in a word */
int wSize = 0; int wSize = 0;
/* how many words are in the sentence */ /* how many words are in the sentence */
int wNum = 0; int wNum = 0;
int wNumLocal = 0; int wNumLocal = 0;
int i = 0; int i = 0;
for(i = 0; i < len; i++){ for (i = 0; i < len; i++) {
/* load word (id) separated by space or tab */ /* load word (id) separated by space or tab */
if((line[i] == ' ' || line[i] == '\t') && wSize > 0){ if ((line[i] == ' ' || line[i] == '\t') && wSize > 0) {
line[i] = 0; line[i] = 0;
if(wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|'){ if (wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|') {
seqLen[seqCount] = wNumLocal; seqLen[seqCount] = wNumLocal;
seqOffset[seqCount] = wordCount + wNum - wNumLocal; seqOffset[seqCount] = wordCount + wNum - wNumLocal;
seqCount++; seqCount++;
wNumLocal = 0; wNumLocal = 0;
} }
else{ else {
buf[wordCount + wNum++] = atoi(line + i - wSize); buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++; wNumLocal++;
} }
...@@ -152,7 +152,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) ...@@ -152,7 +152,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
wSize++; wSize++;
} }
if(wSize > 0){ if (wSize > 0) {
buf[wordCount + wNum++] = atoi(line + i - wSize); buf[wordCount + wNum++] = atoi(line + i - wSize);
wNumLocal++; wNumLocal++;
} }
...@@ -164,7 +164,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) ...@@ -164,7 +164,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
wordCount += wNum; wordCount += wNum;
lineCount++; lineCount++;
if(wordCount >= bufSize - MAX_SEQUENCE_LENGTH) if (wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
break; break;
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!"); CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
...@@ -176,11 +176,11 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) ...@@ -176,11 +176,11 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
/* sort the sequences by length */ /* sort the sequences by length */
if (isSorted) { if (isSorted) {
CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!"); CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
SampleNode * nodes = new SampleNode[seqCount]; SampleNode* nodes = new SampleNode[seqCount];
int count = 0; int count = 0;
int offset = 0; int offset = 0;
for (int i = 0; i < seqCount; i += step) { for (int i = 0; i < seqCount; i += step) {
SampleNode &node = nodes[count]; SampleNode& node = nodes[count];
node.id = count; node.id = count;
node.offset = i; node.offset = i;
node.p = buf + offset; node.p = buf + offset;
...@@ -222,10 +222,10 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) ...@@ -222,10 +222,10 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
count = 0; count = 0;
offset = 0; offset = 0;
for(int i = 0; i < seqCount; i += step){ for (int i = 0; i < seqCount; i += step) {
SampleNode &node = nodes[count]; SampleNode& node = nodes[count];
memcpy(buf2 + offset, node.p, sizeof(int) * node.size); memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){ for (int j = 0; j < step; j++) {
seqLen2[i + j] = seqLen[node.offset + j]; seqLen2[i + j] = seqLen[node.offset + j];
seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0); seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
} }
...@@ -233,7 +233,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step) ...@@ -233,7 +233,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
offset += node.size; offset += node.size;
} }
int * tmp = buf; int* tmp = buf;
buf = buf2; buf = buf2;
buf2 = tmp; buf2 = tmp;
tmp = seqLen; tmp = seqLen;
...@@ -264,7 +264,7 @@ void T2TBatchLoader::SetRandomBatch(bool flag) ...@@ -264,7 +264,7 @@ void T2TBatchLoader::SetRandomBatch(bool flag)
} }
/* /*
load a batch of sequences load a batch of sequences
>> file - the handle to the data file >> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms >> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences >> batchEnc - the batch of the input sequences
...@@ -282,28 +282,28 @@ load a batch of sequences ...@@ -282,28 +282,28 @@ load a batch of sequences
>> devID - device id >> devID - device id
>> isTraining - indicates whether we are training the model >> isTraining - indicates whether we are training the model
*/ */
int T2TBatchLoader::LoadBatch(FILE * file, bool isLM, int T2TBatchLoader::LoadBatch(FILE* file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount, bool isSorted, int& ws, int& wCount,
int devID, bool isTraining) int devID, bool isTraining)
{ {
if(isLM){ if (isLM) {
return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label, return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, sBatch, wBatch, seqs, vsEnc, sBatch, wBatch,
isSorted, wCount, devID, isTraining); isSorted, wCount, devID, isTraining);
} }
else{ else {
return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label, return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
seqs, vsEnc, vsDec, sBatch, wBatch, seqs, vsEnc, vsDec, sBatch, wBatch,
isSorted, ws, wCount, devID, isTraining); isSorted, ws, wCount, devID, isTraining);
} }
} }
/* /*
load a batch of sequences (for LM) load a batch of sequences (for LM)
>> file - the handle to the data file >> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms >> isLM - indicates whether the data is used for training lms
...@@ -322,16 +322,16 @@ load a batch of sequences (for LM) ...@@ -322,16 +322,16 @@ load a batch of sequences (for LM)
>> devID - device id >> devID - device id
>> isTraining - indicates whether we are training the model >> isTraining - indicates whether we are training the model
*/ */
int T2TBatchLoader::LoadBatchLM(FILE * file, int T2TBatchLoader::LoadBatchLM(FILE* file,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int* seqs,
int vSize, int sBatch, int wBatch, int vSize, int sBatch, int wBatch,
bool isSorted, int &wCount, bool isSorted, int& wCount,
int devID, bool isTraining) int devID, bool isTraining)
{ {
if(nextSeq < 0 || nextSeq >= nseqBuf) if (nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 1); LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0); int seq = MAX(nextSeq, 0);
...@@ -339,25 +339,25 @@ int T2TBatchLoader::LoadBatchLM(FILE * file, ...@@ -339,25 +339,25 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
int wn = 0; int wn = 0;
int sc = 0; int sc = 0;
int max = 0; int max = 0;
while(seq + sc < nseqBuf){ while (seq + sc < nseqBuf) {
int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1; int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
CheckNTErrors(len > 0, "Empty sequence!"); CheckNTErrors(len > 0, "Empty sequence!");
wn = len; wn = len;
wc += wn; wc += wn;
sc += 1; sc += 1;
if(max < wn) if (max < wn)
max = wn; max = wn;
int tc = isBigBatch ? wc : max * sc; int tc = isBigBatch ? wc : max * sc;
if(sc >= sBatch && tc >= wBatch) if (sc >= sBatch && tc >= wBatch)
break; break;
} }
wCount = 0; wCount = 0;
nextSeq = seq + sc; nextSeq = seq + sc;
if(sc <= 0) if (sc <= 0)
return 0; return 0;
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
...@@ -378,22 +378,22 @@ int T2TBatchLoader::LoadBatchLM(FILE * file, ...@@ -378,22 +378,22 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
int seqSize = 0; int seqSize = 0;
int * batchEncValues = new int[batchEnc->unitNum]; int* batchEncValues = new int[batchEnc->unitNum];
int * labelValues = new int[label->unitNum]; int* labelValues = new int[label->unitNum];
MTYPE * goldOffsets = new MTYPE[gold->unitNum]; MTYPE* goldOffsets = new MTYPE[gold->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum]; MTYPE* paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum]; MTYPE* paddingDecOffsets = new MTYPE[paddingDec->unitNum];
int wGold = 0; int wGold = 0;
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum); memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
memset(labelValues, 0, sizeof(int) * label->unitNum); memset(labelValues, 0, sizeof(int) * label->unitNum);
for(int s = seq; s < seq + sc; s++){ for (int s = seq; s < seq + sc; s++) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1; int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!"); CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){ for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num; batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w); paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
...@@ -402,27 +402,26 @@ int T2TBatchLoader::LoadBatchLM(FILE * file, ...@@ -402,27 +402,26 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w]; labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
} }
if (w == len - 1) { if (w == len - 1) {
if (isDoubledEnd) { if (isDoubledEnd) {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w]; labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
} }
else { else {
goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]); goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1]; labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
} }
} }
wCount++; wCount++;
if(seqs != NULL) if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w]; seqs[seqSize++] = buf[seqOffset[s] + w];
} }
if(seqs != NULL){ if (seqs != NULL) {
for(int w = len; w < max; w++) for (int w = len; w < max; w++)
seqs[seqSize++] = -1; seqs[seqSize++] = -1;
} }
} }
...@@ -437,7 +436,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file, ...@@ -437,7 +436,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
_ConvertDataType(batchEnc, tmp); _ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0); _NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp); DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID); XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
_ConvertDataType(batchEnc, tmp2); _ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0); _NotEqual(tmp2, paddingDec, 0);
...@@ -449,17 +448,14 @@ int T2TBatchLoader::LoadBatchLM(FILE * file, ...@@ -449,17 +448,14 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
delete[] paddingEncOffsets; delete[] paddingEncOffsets;
delete[] paddingDecOffsets; delete[] paddingDecOffsets;
fflush(tf);
return sc; return sc;
} }
int CompareBatchNode(const void * a, const void * b) int CompareBatchNode(const void* a, const void* b)
{ {
return ((BatchNode*)b)->key - ((BatchNode*)a)->key; return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
} }
/* /*
load a batch of sequences (for MT) load a batch of sequences (for MT)
>> file - the handle to the data file >> file - the handle to the data file
...@@ -479,14 +475,14 @@ load a batch of sequences (for MT) ...@@ -479,14 +475,14 @@ load a batch of sequences (for MT)
>> devID - device id >> devID - device id
>> isTraining - indicates whether we are training the model >> isTraining - indicates whether we are training the model
*/ */
int T2TBatchLoader::LoadBatchMT(FILE * file, int T2TBatchLoader::LoadBatchMT(FILE* file,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int* seqs,
int vSizeEnc, int vSizeDec, int sBatch, int wBatch, int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount, bool isSorted, int& ws, int& wCount,
int devID, bool isTraining) int devID, bool isTraining)
{ {
if (nextBatch < 0 || nextBatch >= bufBatchSize) { if (nextBatch < 0 || nextBatch >= bufBatchSize) {
LoadBuf(file, isSorted, 2); LoadBuf(file, isSorted, 2);
...@@ -498,7 +494,6 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -498,7 +494,6 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
/* we segment the buffer into batches */ /* we segment the buffer into batches */
while (seq < nseqBuf) { while (seq < nseqBuf) {
int wcEnc = 0; int wcEnc = 0;
int wcDec = 0; int wcDec = 0;
int wnEnc = 0; int wnEnc = 0;
...@@ -508,7 +503,6 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -508,7 +503,6 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
int sc = 0; int sc = 0;
while (seq + sc < nseqBuf) { while (seq + sc < nseqBuf) {
/* source-side sequence */ /* source-side sequence */
wnEnc = seqLen[seq + sc]; wnEnc = seqLen[seq + sc];
...@@ -534,7 +528,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -534,7 +528,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
maxDec = wnDec; maxDec = wnDec;
} }
BatchNode & batch = bufBatch[bufBatchSize]; BatchNode& batch = bufBatch[bufBatchSize];
batch.beg = seq; batch.beg = seq;
batch.end = seq + sc; batch.end = seq + sc;
batch.maxEnc = maxEnc; batch.maxEnc = maxEnc;
...@@ -545,14 +539,14 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -545,14 +539,14 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
seq = seq + sc; seq = seq + sc;
} }
if(isRandomBatch) if (isRandomBatch)
qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode); qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
} }
if(bufBatchSize <= 0) if (bufBatchSize <= 0)
return 0; return 0;
BatchNode & batch = bufBatch[nextBatch++]; BatchNode& batch = bufBatch[nextBatch++];
int seq = batch.beg; int seq = batch.beg;
int sc = batch.end - batch.beg; int sc = batch.end - batch.beg;
int maxEnc = batch.maxEnc; int maxEnc = batch.maxEnc;
...@@ -560,7 +554,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -560,7 +554,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
CheckNTErrors(sc % 2 == 0, "The input samples must be paired"); CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
int sCount = sc/2; int sCount = sc / 2;
int seqSize = 0; int seqSize = 0;
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID); InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
...@@ -568,6 +562,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -568,6 +562,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID); InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID); InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2D(label, sCount, maxDec, X_INT, devID); InitTensor2D(label, sCount, maxDec, X_INT, devID);
//InitTensor(gold, 3, dimsDec, X_FLOAT, devID); //InitTensor(gold, 3, dimsDec, X_FLOAT, devID);
batchEnc->SetZeroAll(); batchEnc->SetZeroAll();
...@@ -575,6 +570,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -575,6 +570,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
batchDec->SetZeroAll(); batchDec->SetZeroAll();
paddingDec->SetZeroAll(); paddingDec->SetZeroAll();
label->SetZeroAll(); label->SetZeroAll();
//gold->SetZeroAll(); //gold->SetZeroAll();
int wCountEnc = 0; int wCountEnc = 0;
...@@ -582,11 +578,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -582,11 +578,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
int wCountPad = 0; int wCountPad = 0;
wCount = 0; wCount = 0;
int * batchEncValues = new int[batchEnc->unitNum]; int* batchEncValues = new int[batchEnc->unitNum];
int * batchDecValues = new int[batchDec->unitNum]; int* batchDecValues = new int[batchDec->unitNum];
int * labelValues = new int[label->unitNum]; int* labelValues = new int[label->unitNum];
MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2]; MTYPE* paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2]; MTYPE* paddingDecOffsets = new MTYPE[sc * maxDec / 2];
//MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2]; //MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum); memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
...@@ -594,10 +591,10 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -594,10 +591,10 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
memset(labelValues, 0, sizeof(int) * batchDec->unitNum); memset(labelValues, 0, sizeof(int) * batchDec->unitNum);
/* batch of the source-side sequences */ /* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){ for (int s = seq; s < seq + sc; s += 2) {
int len = seqLen[s]; int len = seqLen[s];
int sent = (s - seq)/2; int sent = (s - seq) / 2;
for(int w = 0; w < len; w++){ for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchEncValues[batchEnc->GetOffset2D(sent, w)] = num; batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w); paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
...@@ -607,6 +604,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -607,6 +604,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
ws = wCountEnc; ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum); batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc); paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
//XTensor * tmp = NewTensorBuf(paddingEnc, devID); //XTensor * tmp = NewTensorBuf(paddingEnc, devID);
//_ConvertDataType(batchEnc, tmp); //_ConvertDataType(batchEnc, tmp);
//tmp->Dump(stderr, "tmp:"); //tmp->Dump(stderr, "tmp:");
...@@ -614,40 +612,45 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -614,40 +612,45 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
//DelTensorBuf(tmp); //DelTensorBuf(tmp);
/* batch of the target-side sequences */ /* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){ for (int s = seq + 1; s < seq + sc; s += 2) {
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1; int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= maxDec, "Something is wrong!"); CheckNTErrors(len <= maxDec, "Something is wrong!");
int sent = (s - seq - 1)/2; int sent = (s - seq - 1) / 2;
for(int w = 0; w < len; w++){ for (int w = 0; w < len; w++) {
int num = buf[seqOffset[s] + w]; int num = buf[seqOffset[s] + w];
batchDecValues[batchDec->GetOffset2D(sent, w)] = num; batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w); //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if (w < len-1){ if (w < len - 1) {
paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w); paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
wCount++; wCount++;
} }
if (w > 0) { if (w > 0) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]); //goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w]; labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
} }
if (w == len - 1) { if (w == len - 1) {
if (isDoubledEnd) { if (isDoubledEnd) {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]); //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w]; labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
} }
else { else {
//goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]); //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1]; labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
} }
} }
//wCount++; //wCount++;
wCountDec++; wCountDec++;
if(seqs != NULL) if (seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w]; seqs[seqSize++] = buf[seqOffset[s] + w];
} }
if(seqs != NULL){ if (seqs != NULL) {
for(int w = len; w < maxDec; w++) for (int w = len; w < maxDec; w++)
seqs[seqSize++] = -1; seqs[seqSize++] = -1;
} }
} }
...@@ -668,19 +671,20 @@ int T2TBatchLoader::LoadBatchMT(FILE * file, ...@@ -668,19 +671,20 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
delete[] labelValues; delete[] labelValues;
delete[] paddingEncOffsets; delete[] paddingEncOffsets;
delete[] paddingDecOffsets; delete[] paddingDecOffsets;
//delete[] goldOffsets; //delete[] goldOffsets;
return sc; return sc;
} }
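The target-side loop above derives the decoder input and the gold labels from the same sequence by a one-position shift: the token at position w is fed to the decoder and the token at position w + 1 is its label, with </s> repeated as the final label when isDoubledEnd is set. A standalone sketch of that shift with invented token ids in plain arrays (XTensor and the real offset bookkeeping are left out):

#include <cstdio>

int main()
{
    /* a target sequence <s> A B C </s>, with invented ids */
    int seqIds[] = { 2, 10, 11, 12, 3 };
    int seqLen = 5;
    bool isDoubledEnd = false;

    /* drop the final </s> from the decoder input unless it is doubled */
    int len = isDoubledEnd ? seqLen : seqLen - 1;
    for (int w = 0; w < len; w++) {
        int input = seqIds[w];
        int label = (isDoubledEnd && w == len - 1) ? seqIds[w] : seqIds[w + 1];
        printf("input=%d -> label=%d\n", input, label);
    }
    return 0;
}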
/* /*
shuffle lines of the file shuffle lines of the file
>> srcFile - the source file to shuffle >> srcFile - the source file to shuffle
>> tgtFile - the resulting file >> tgtFile - the resulting file
*/ */
void T2TBatchLoader::Shuffle(const char * srcFile, const char * tgtFile) void T2TBatchLoader::Shuffle(const char* srcFile, const char* tgtFile)
{ {
char * line = new char[MAX_LINE_LENGTH]; char* line = new char[MAX_LINE_LENGTH];
#ifndef WIN32 #ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile); sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line); system(line);
...@@ -690,5 +694,4 @@ void T2TBatchLoader::Shuffle(const char * srcFile, const char * tgtFile) ...@@ -690,5 +694,4 @@ void T2TBatchLoader::Shuffle(const char * srcFile, const char * tgtFile)
delete[] line; delete[] line;
} }
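Shuffle shells out to the POSIX shuf tool, so the #ifndef WIN32 guard leaves the non-POSIX branch (elided in this hunk) to cope differently. A hedged, portable sketch that shuffles lines in memory with the standard library instead; like shuf, it assumes one complete training sample per line, and additionally that the corpus fits in RAM:

#include <algorithm>
#include <fstream>
#include <random>
#include <string>
#include <vector>

/* shuffle the lines of srcFile into tgtFile (portable sketch) */
void ShuffleLines(const char* srcFile, const char* tgtFile)
{
    std::ifstream in(srcFile);
    std::vector<std::string> lines;
    std::string line;
    while (std::getline(in, line))
        lines.push_back(line);

    std::mt19937 rng(std::random_device{}());
    std::shuffle(lines.begin(), lines.end(), rng);

    std::ofstream out(tgtFile);
    for (const std::string& l : lines)
        out << l << '\n';
}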
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,13 +17,14 @@ ...@@ -17,13 +17,14 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :) * it is cold today but I'll move to a warm place tomorrow :)
*/ */
#ifndef __T2TBATCHLOADER_H__ #ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__ #define __T2TBATCHLOADER_H__
#include "../../network/XNet.h" #include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts; using namespace nts;
...@@ -35,7 +36,7 @@ namespace transformer ...@@ -35,7 +36,7 @@ namespace transformer
/* node to keep batch information */ /* node to keep batch information */
struct BatchNode struct BatchNode
{ {
/* begining position */ /* beginning position */
int beg; int beg;
/* end position */ /* end position */
...@@ -55,13 +56,13 @@ class T2TBatchLoader ...@@ -55,13 +56,13 @@ class T2TBatchLoader
{ {
public: public:
/* buffer for loading words */ /* buffer for loading words */
int * buf; int* buf;
/* another buffer */ /* another buffer */
int * buf2; int* buf2;
/* batch buf */ /* batch buf */
BatchNode * bufBatch; BatchNode* bufBatch;
/* buffer size */ /* buffer size */
int bufSize; int bufSize;
...@@ -70,13 +71,13 @@ public: ...@@ -70,13 +71,13 @@ public:
int bufBatchSize; int bufBatchSize;
/* length of each sequence */ /* length of each sequence */
int * seqLen; int* seqLen;
/* another array */ /* another array */
int * seqLen2; int* seqLen2;
/* offset of the first word for each sequence */ /* offset of the first word for each sequence */
int * seqOffset; int* seqOffset;
/* number of sequences in the buffer */ /* number of sequences in the buffer */
int nseqBuf; int nseqBuf;
...@@ -87,9 +88,9 @@ public: ...@@ -87,9 +88,9 @@ public:
/* offset for next batch */ /* offset for next batch */
int nextBatch; int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */ /* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd; bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc /* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */ length and sc is the sentence number */
...@@ -112,10 +113,10 @@ public: ...@@ -112,10 +113,10 @@ public:
~T2TBatchLoader(); ~T2TBatchLoader();
/* initialization */ /* initialization */
void Init(int argc, char ** argv); void Init(T2TConfig& config);
/* load data to buffer */ /* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step); int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */ /* clear data buffer */
void ClearBuf(); void ClearBuf();
...@@ -124,36 +125,37 @@ public: ...@@ -124,36 +125,37 @@ public:
void SetRandomBatch(bool flag = true); void SetRandomBatch(bool flag = true);
/* load a batch of sequences */ /* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM, int LoadBatch(FILE* file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount, bool isSorted, int& ws, int& wCount,
int devID, bool isTraining); int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */ /* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file, int LoadBatchLM(FILE* file,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int vs, int sBatch, int wBatch, int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount, bool isSorted, int& wCount,
int devID, bool isTraining); int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */ /* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file, int LoadBatchMT(FILE* file,
XTensor * batchEnc, XTensor * paddingEnc, XTensor* batchEnc, XTensor* paddingEnc,
XTensor * batchDec, XTensor * paddingDec, XTensor* batchDec, XTensor* paddingDec,
XTensor * gold, XTensor * label, XTensor* gold, XTensor* label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount, bool isSorted, int& ws, int& wCount,
int devID, bool isTraining); int devID, bool isTraining);
/* shuffle the data file */ /* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile); void Shuffle(const char* srcFile, const char* tgtFile);
}; };
} }
#endif #endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -19,13 +19,13 @@ ...@@ -19,13 +19,13 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/ */
#include <math.h> #include <cmath>
#include "T2TTrainer.h" #include "T2TTrainer.h"
#include "T2TUtility.h" #include "../module/T2TUtility.h"
#include "../../tensor/XUtility.h" #include "../../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h" #include "../../../tensor/core/CHeader.h"
#include "../../tensor/loss/LHeader.h" #include "../../../tensor/loss/LHeader.h"
#include "../../network/XNoder.h" #include "../../../network/XNoder.h"
#ifndef WIN32 #ifndef WIN32
#include <sys/time.h> #include <sys/time.h>
...@@ -38,85 +38,72 @@ namespace transformer ...@@ -38,85 +38,72 @@ namespace transformer
/* constructor */ /* constructor */
T2TTrainer::T2TTrainer() T2TTrainer::T2TTrainer()
{ {
argNum = 0; cfg = NULL;
argArray = NULL;
} }
/* de-constructor */ /* de-constructor */
T2TTrainer::~T2TTrainer() T2TTrainer::~T2TTrainer()
{ {
for(int i = 0; i < moments.count; i++){ for (int i = 0; i < moments.count; i++) {
XTensor * m = (XTensor*)moments.Get(i); XTensor* m = (XTensor*)moments.Get(i);
delete m; delete m;
} }
for(int i = 0; i < moments2nd.count; i++){ for (int i = 0; i < moments2nd.count; i++) {
XTensor * m = (XTensor*)moments2nd.Get(i); XTensor* m = (XTensor*)moments2nd.Get(i);
delete m; delete m;
} }
for(int i = 0; i < argNum; i++)
delete[] argArray[i];
delete[] argArray;
} }
/* /*
initialization initialization
>> argc - number of arguments >> config - configurations of the training process
>> argv - list of pointers to the arguments
*/ */
void T2TTrainer::Init(int argc, char ** argv) void T2TTrainer::Init(T2TConfig& config)
{ {
argNum = argc; cfg = &config;
argArray = new char*[argc]; lrate = config.lrate;
for(int i = 0; i < argNum; i++){ lrbias = config.lrbias;
argArray[i] = new char[strlen(argv[i]) + 1]; sBatchSize = config.sBatchSize;
strcpy(argArray[i], argv[i]); wBatchSize = config.wBatchSize;
} nepoch = config.nepoch;
nstep = config.nstep;
LoadParamFloat(argc, argv, "lrate", &lrate, 1.0F); d = config.modelSize;
LoadParamFloat(argc, argv, "lrbias", &lrbias, 0); nwarmup = config.nwarmup;
LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1); vSize = config.srcVocabSize;
LoadParamInt(argc, argv, "wbatch", &wBatchSize, 1); vSizeTgt = config.tgtVocabSize;
LoadParamInt(argc, argv, "nepoch", &nepoch, 1); useAdam = config.useAdam;
LoadParamInt(argc, argv, "nstep", &nstep, 1); adamBeta1 = config.adamBeta1;
LoadParamInt(argc, argv, "d", &d, 512); adamBeta2 = config.adamBeta2;
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000); adamDelta = config.adamDelta;
LoadParamInt(argc, argv, "vsize", &vSize, 1); isShuffled = config.isShuffled;
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize); labelSmoothingP = config.labelSmoothingP;
LoadParamBool(argc, argv, "adam", &useAdam, false); nStepCheckpoint = config.nStepCheckpoint;
LoadParamFloat(argc, argv, "adambeta1", &adamBeta1, 0.9F); useEpochCheckpoint = config.useEpochCheckpoint;
LoadParamFloat(argc, argv, "adambeta2", &adamBeta2, 0.98F); updateStep = config.updateStep;
LoadParamFloat(argc, argv, "adamdelta", &adamDelta, 1e-9F); isDebugged = config.isDebugged;
LoadParamBool(argc, argv, "shuffled", &isShuffled, false); isLenSorted = config.isLenSorted;
LoadParamFloat(argc, argv, "labelsmoothing", &labelSmoothingP, 0);
LoadParamInt(argc, argv, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, argv, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamInt(argc, argv, "updatestep", &updateStep, 1);
LoadParamBool(argc, argv, "debug", &isDebugged, false);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
adamBeta1T = 1.0F; adamBeta1T = 1.0F;
adamBeta2T = 1.0F; adamBeta2T = 1.0F;
batchLoader.Init(argc, argv); batchLoader.Init(config);
} }
int tc = 0; int tc = 0;
/* /*
train the model train the model
>> fn - training data file >> fn - training data file
>> validFN - validation data file >> validFN - validation data file
>> modelFN - where we keep the model >> modelFN - where we keep the model
>> model - model to train >> model - model to train
*/ */
void T2TTrainer::Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model) void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model)
{ {
int step = 0; int step = 0;
int wc = 0; int wc = 0;
int ws =0; int ws = 0;
int wordCount = 0; int wordCount = 0;
int wordCountTotal = 0; int wordCountTotal = 0;
int batchCountTotal = 0; int batchCountTotal = 0;
...@@ -130,36 +117,35 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -130,36 +117,35 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
int validStep = 0; int validStep = 0;
int epoch = 0; int epoch = 0;
char * trainFN = new char[(int)strlen(fn) + 10]; char* trainFN = new char[(int)strlen(fn) + 10];
strcpy(trainFN, fn); strcpy(trainFN, fn);
#ifndef WIN32 #ifndef WIN32
if(isShuffled) if (isShuffled)
sprintf(trainFN, "%s.random", fn); sprintf(trainFN, "%s.random", fn);
#endif #endif
int devID = model->devID; int devID = model->devID;
XNet net; XNet net;
if(isDebugged)
net.SetGradEfficientFlag(false);
PrepareModel(model); PrepareModel(model);
double startT = GetClockSec(); double startT = GetClockSec();
for(epoch = 1; epoch <= nepoch; epoch++){ for (epoch = 1; epoch <= nepoch; epoch++) {
#ifndef WIN32 #ifndef WIN32
if(isShuffled) if (isShuffled) {
fprintf(stderr, "shuffle the file\n");
batchLoader.Shuffle(fn, trainFN); batchLoader.Shuffle(fn, trainFN);
}
#endif #endif
FILE * file = fopen(trainFN, "rb"); FILE* file = fopen(trainFN, "r");
CheckNTErrors(file, "cannot open training file!"); CheckNTErrors(file, "cannot open training file!");
wordCount = 0; wordCount = 0;
loss = 0; loss = 0;
/* batch of sequences (on the encoder and decoder sides) */ /* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc; XTensor batchEnc;
XTensor batchDec; XTensor batchDec;
...@@ -173,24 +159,23 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -173,24 +159,23 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* gold standard */ /* gold standard */
XTensor gold; XTensor gold;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch"); CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */ /* output probabilities */
XTensor output; XTensor output;
/* make the network */ /* make the network */
if(model->isLM) if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, true); model->MakeLM(batchEnc, output, paddingEnc, true);
else if(model->isMT) else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true); model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, true);
else{ else {
ShowNTErrors("Illegal model type!"); ShowNTErrors("Illegal model type!");
} }
...@@ -199,16 +184,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -199,16 +184,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XTensor lossTensor; XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP); labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec); lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
DTYPE lossLocal = lossBatch / wc; DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F); bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
if (doUpdate) { if (doUpdate) {
/* back-propagation */ /* back-propagation */
net.Backward(lossTensor); net.Backward(lossTensor);
gradStep += 1; gradStep += 1;
loss += lossBatch; loss += lossBatch;
wordCount += wc; wordCount += wc;
...@@ -216,74 +203,73 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model ...@@ -216,74 +203,73 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
batchCountTotal += ws; batchCountTotal += ws;
/* update the parameters */ /* update the parameters */
if(gradStep == updateStep){ if (gradStep == updateStep) {
/* learning rate */ /* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) * lr = lrate * (1.0F / (float)sqrt((float)d)) *
(float)MIN(pow((float)validStep + 1, -0.5F - lrbias), (float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias)); ((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
/* model update */ /* model update */
Update(model, lr); Update(model, lr);
gradStep = 0; gradStep = 0;
validStep++; validStep++;
} }
} }
else else
nSkipped++; nSkipped++;
if(++step >= nstep){ if (++step >= nstep) {
isEnd = true; isEnd = true;
break; break;
} }
if (step % 100 == 0) { if (step % 100 == 0) {
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f", XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, elapsed, step, epoch,
wordCountTotal, batchCountTotal, wordCountTotal, batchCountTotal,
loss/wordCount, exp(loss/wordCount), exp(lossBatch /wc)); loss / wordCount, exp(loss / wordCount), exp(lossBatch / wc));
if (!doUpdate) if (!doUpdate)
XPRINT(0, stderr, " (no update)"); XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n"); XPRINT(0, stderr, "\n");
} }
if(nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint){ if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
MakeCheckpoint(model, validFN, modelFN, "step", step); MakeCheckpoint(model, validFN, modelFN, "step", step);
nStepCheck = 0; nStepCheck = 0;
nCheckpoint++; nCheckpoint++;
} }
} }
fclose(file); fclose(file);
if (isEnd) if (isEnd)
break; break;
if(useEpochCheckpoint) if (useEpochCheckpoint)
MakeCheckpoint(model, validFN, modelFN, "epoch", epoch); MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
} }
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
epoch = MIN(epoch, nepoch); epoch = MIN(epoch, nepoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n", XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss/wordCount, exp(loss/wordCount)); lr, elapsed, step, epoch, wordCountTotal, loss / wordCount, exp(loss / wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n", XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch); elapsed, step, nSkipped, epoch);
delete[] trainFN; delete[] trainFN;
} }
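The parameter-update block inside Train implements the inverse-square-root warmup schedule of the original Transformer, generalized by the lrbias knob (lrbias = 0 recovers the standard schedule). With t = validStep + 1, the code computes

lr(t) = lrate * d^{-0.5} * min(t^{-0.5 - lrbias}, t * nwarmup^{-1.5 - lrbias})

so the rate grows linearly over roughly the first nwarmup updates and decays as t^{-0.5 - lrbias} afterwards.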
/* /*
test the model test the model
>> fn - test data file >> fn - test data file
>> ofn - output data file >> ofn - output data file
>> model - model that is trained >> model - model that is trained
*/ */
void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
{ {
int wc = 0; int wc = 0;
int ws = 0; int ws = 0;
...@@ -292,13 +278,13 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -292,13 +278,13 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
float loss = 0; float loss = 0;
/* data files */ /* data files */
FILE * file = fopen(fn, "rb"); FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file"); CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb"); FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file"); CheckNTErrors(ofile, "Cannot open the output file");
double startT = GetClockSec(); double startT = GetClockSec();
/* batch of input sequences */ /* batch of input sequences */
XTensor batchEnc; XTensor batchEnc;
XTensor batchDec; XTensor batchDec;
...@@ -314,26 +300,26 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -314,26 +300,26 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
XTensor gold; XTensor gold;
/* an array that keeps the sequences */ /* an array that keeps the sequences */
int * seqs = new int[MILLION]; int* seqs = new int[MILLION];
batchLoader.ClearBuf(); batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM, while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label, &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt, seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, model->devID, false)) 1, 1, false, ws, wc, model->devID, false))
{ {
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch"); CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */ /* output probabilities */
XTensor output; XTensor output;
/* make the network */ /* make the network */
if(model->isLM) if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false); model->MakeLM(batchEnc, output, paddingEnc, false);
else if(model->isMT) else if (model->isMT)
model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false); model->MakeMT(batchEnc, batchDec, output, paddingEnc, paddingDec, false);
else{ else {
ShowNTErrors("Illegal model type!"); ShowNTErrors("Illegal model type!");
} }
...@@ -348,19 +334,19 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -348,19 +334,19 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
float lossBatch = ReduceSumAllValue(lossTensor); float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */ /* dump the test result */
for(int s = 0; s < bSize; s++){ for (int s = 0; s < bSize; s++) {
DTYPE sum = 0; DTYPE sum = 0;
int * seq = seqs + s * length; int* seq = seqs + s * length;
for(int i = 0; i < length; i++){ for (int i = 0; i < length; i++) {
if(seq[i] >= 0){ if (seq[i] >= 0) {
fprintf(ofile, "%d ", seq[i]); fprintf(ofile, "%d ", seq[i]);
} }
else else
break; break;
} }
fprintf(ofile, "||| "); fprintf(ofile, "||| ");
for(int i = 0; i < length; i++){ for (int i = 0; i < length; i++) {
if(seq[i] >= 0){ if (seq[i] >= 0) {
DTYPE p = lossTensor.Get2D(s, i); DTYPE p = lossTensor.Get2D(s, i);
fprintf(ofile, "%.3e ", p); fprintf(ofile, "%.3e ", p);
sum += p; sum += p;
...@@ -370,7 +356,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -370,7 +356,7 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
} }
fprintf(ofile, "||| %e\n", sum); fprintf(ofile, "||| %e\n", sum);
} }
loss += lossBatch; loss += lossBatch;
wordCount += wc; wordCount += wc;
...@@ -381,39 +367,40 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model) ...@@ -381,39 +367,40 @@ void T2TTrainer::Validate(const char * fn, const char * ofn, T2TModel * model)
fclose(ofile); fclose(ofile);
delete[] seqs; delete[] seqs;
double elapsed = GetClockSec() - startT; double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n", XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount)); elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
} }
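The loss and ppl reported here (and during training) are the token-averaged cross entropy and its exponential: with wordCount = N target tokens and the per-token negative log-probabilities summed into loss,

loss/N = (1/N) * \sum_i -log p(y_i | y_{<i}, x),    ppl = exp(loss/N)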
/* /*
make a checkpoint make a checkpoint
>> model - the model >> model - the model
>> validFN - validation data file >> validFN - validation data file
>> modelFN - model data file >> modelFN - model data file
>> label - label of the model >> label - label of the model
>> id - id of the checkpoint >> id - id of the checkpoint
*/ */
void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id) void T2TTrainer::MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id)
{ {
char * fn = new char[MAX_LINE_LENGTH]; fprintf(stderr, "make a checkpoint\n");
char* fn = new char[MAX_LINE_LENGTH];
sprintf(fn, "%s.%s.%03d", modelFN, label, id); sprintf(fn, "%s.%s.%03d", modelFN, label, id);
model->Dump(fn); model->Dump(fn);
delete[] fn; delete[] fn;
char* fn2 = new char[MAX_LINE_LENGTH]; char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id); sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if(validFN != NULL){ if (validFN != NULL) {
T2TTrainer trainer; T2TTrainer trainer;
trainer.Init(argNum, argArray); trainer.Init(*cfg);
trainer.Validate(validFN, fn2, model); trainer.Validate(validFN, fn2, model);
} }
delete[] fn2; delete[] fn2;
} }
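The checkpoint file name is the model file plus a label ("step" or "epoch") and a zero-padded id, and the validation output of that checkpoint gets an extra .output suffix. A small illustration with hypothetical file names:

#include <cstdio>

int main()
{
    char fn[1024], fn2[1024];
    /* hypothetical arguments: modelFN = "model.bin", label = "step", id = 10 */
    sprintf(fn,  "%s.%s.%03d",        "model.bin", "step", 10);
    sprintf(fn2, "%s.%s.%03d.output", "model.bin", "step", 10);
    printf("%s\n%s\n", fn, fn2);   /* model.bin.step.010 and model.bin.step.010.output */
    return 0;
}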
/* /*
update the model by delta rule update the model by delta rule
\theta_{new} = \theta - \lrate * grad \theta_{new} = \theta - \lrate * grad
where where
...@@ -421,15 +408,15 @@ where ...@@ -421,15 +408,15 @@ where
>> model - the t2t model >> model - the t2t model
>> lr - learning rate >> lr - learning rate
*/ */
void T2TTrainer::Update(T2TModel * model, const float lr) void T2TTrainer::Update(T2TModel* model, const float lr)
{ {
TensorList ws(100); TensorList ws(100);
model->GetParams(ws); model->GetParams(ws);
for(int i = 0; i < ws.count; i++){ for (int i = 0; i < ws.Size(); i++) {
XTensor * para = (XTensor*)ws.Get(i); XTensor* para = ws[i];
XTensor * paraGrad = para->grad; XTensor* paraGrad = para->grad;
if (paraGrad == NULL) if (paraGrad == NULL)
continue; continue;
...@@ -437,24 +424,24 @@ void T2TTrainer::Update(T2TModel * model, const float lr) ...@@ -437,24 +424,24 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
CheckNTErrors(para != NULL, "NULL parameter tensor!"); CheckNTErrors(para != NULL, "NULL parameter tensor!");
CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!"); CheckNTErrors(paraGrad != NULL, "NULL gradient tensor!");
if(useAdam){ if (useAdam) {
adamBeta1T *= adamBeta1; adamBeta1T *= adamBeta1;
adamBeta2T *= adamBeta2; adamBeta2T *= adamBeta2;
DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T); DTYPE e = lr * (DTYPE)sqrt(1 - adamBeta2T) / (1 - adamBeta1T);
DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T); DTYPE d = adamDelta * (DTYPE)sqrt(1 - adamBeta2T);
/* m = beta_1 * m + (1-beta_1) * grad */ /* m = beta_1 * m + (1-beta_1) * grad */
XTensor * m = (XTensor*)moments.Get(i); XTensor* m = (XTensor*)moments.Get(i);
_ScaleAndShiftMe(m, adamBeta1, 0); _ScaleAndShiftMe(m, adamBeta1, 0);
_Sum(m, paraGrad, m, (1.0F - adamBeta1)); _Sum(m, paraGrad, m, (1.0F - adamBeta1));
/* v = beta_2 * v + (1-beta_2) * grad * grad*/ /* v = beta_2 * v + (1-beta_2) * grad * grad*/
XTensor * v = (XTensor*)moments2nd.Get(i); XTensor* v = (XTensor*)moments2nd.Get(i);
_Multiply(paraGrad, paraGrad, v, adamBeta2/(1.0F - adamBeta2)); _Multiply(paraGrad, paraGrad, v, adamBeta2 / (1.0F - adamBeta2));
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0); _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */ /* v2 = m / (sqrt(v) + delta) */
XTensor * v2 = NewTensorBuf(v, v->devID); XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F); _Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d); _ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2); _Div(m, v2, v2);
...@@ -464,7 +451,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr) ...@@ -464,7 +451,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
DelTensorBuf(v2); DelTensorBuf(v2);
} }
else{ else {
/* the delta rule */ /* the delta rule */
_Sum(para, paraGrad, para, -lr); _Sum(para, paraGrad, para, -lr);
} }
...@@ -474,11 +461,11 @@ void T2TTrainer::Update(T2TModel * model, const float lr) ...@@ -474,11 +461,11 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
} }
} }
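The factoring in Update is the bias-corrected Adam rule in disguise. Writing \hat{m} = m/(1 - \beta_1^t) and \hat{v} = v/(1 - \beta_2^t), the step applied by the code,

e = lr * sqrt(1 - \beta_2^t)/(1 - \beta_1^t),    d = \delta * sqrt(1 - \beta_2^t),    \theta_{new} = \theta - e * m/(sqrt(v) + d),

is algebraically identical to the textbook update \theta_{new} = \theta - lr * \hat{m}/(sqrt(\hat{v}) + \delta): dividing the numerator and denominator of m/(sqrt(v) + d) by sqrt(1 - \beta_2^t) absorbs both correction factors.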
/* /*
prepare model for training prepare model for training
>> model - the model for training >> model - the model for training
*/ */
void T2TTrainer::PrepareModel(T2TModel * model) void T2TTrainer::PrepareModel(T2TModel* model)
{ {
moments.Clear(); moments.Clear();
moments2nd.Clear(); moments2nd.Clear();
...@@ -487,13 +474,13 @@ void T2TTrainer::PrepareModel(T2TModel * model) ...@@ -487,13 +474,13 @@ void T2TTrainer::PrepareModel(T2TModel * model)
model->GetParams(ws); model->GetParams(ws);
for(int i = 0; i < ws.count; i++){ for (int i = 0; i < ws.Size(); i++) {
XTensor * para = (XTensor*)ws.Get(i); XTensor* para = ws[i];
XNoder::MakeGrad(para); XNoder::MakeGrad(para);
if(useAdam){ if (useAdam) {
XTensor * m = new XTensor(para); XTensor* m = new XTensor(para);
XTensor * m2 = new XTensor(para); XTensor* m2 = new XTensor(para);
m->SetZeroAll(); m->SetZeroAll();
m2->SetZeroAll(); m2->SetZeroAll();
moments.Add(m); moments.Add(m);
...@@ -505,4 +492,4 @@ void T2TTrainer::PrepareModel(T2TModel * model) ...@@ -505,4 +492,4 @@ void T2TTrainer::PrepareModel(T2TModel * model)
adamBeta2T = 1.0F; adamBeta2T = 1.0F;
} }
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -22,9 +22,9 @@ ...@@ -22,9 +22,9 @@
#ifndef __T2TTRAINER_H__ #ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__ #define __T2TTRAINER_H__
#include "T2TModel.h" #include "../T2TModel.h"
#include "T2TBatchLoader.h" #include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h" #include "../../../tensor/function/FHeader.h"
using namespace nts; using namespace nts;
...@@ -35,15 +35,13 @@ namespace transformer ...@@ -35,15 +35,13 @@ namespace transformer
class T2TTrainer class T2TTrainer
{ {
public: public:
/* paramter number */
int argNum;
/* parameter array */ /* configurations */
char ** argArray; T2TConfig* cfg;
/* dimension size of each inner layer */ /* dimension size of each inner layer */
int d; int d;
/* step number of warm-up for training */ /* step number of warm-up for training */
int nwarmup; int nwarmup;
...@@ -55,7 +53,7 @@ public: ...@@ -55,7 +53,7 @@ public:
/* learning rate */ /* learning rate */
float lrate; float lrate;
/* the parameter that controls the maximum learning rate in training */ /* the parameter that controls the maximum learning rate in training */
float lrbias; float lrbias;
...@@ -81,24 +79,24 @@ public: ...@@ -81,24 +79,24 @@ public:
float adamBeta1T; float adamBeta1T;
float adamBeta2T; float adamBeta2T;
/* list of the moment of the parameter matrics */ /* list of the moment of the parameter matrices */
TensorList moments; TensorList moments;
/* list of the 2nd order moment of the parameter matrics */ /* list of the 2nd order moment of the parameter matrices */
TensorList moments2nd; TensorList moments2nd;
/* indicates whether the data file is shuffled for training */ /* indicates whether the data file is shuffled for training */
bool isShuffled; bool isShuffled;
/* the factor of label smoothing */ /* the factor of label smoothing */
DTYPE labelSmoothingP; DTYPE labelSmoothingP;
/* number of steps after which we make a checkpoint */ /* number of steps after which we make a checkpoint */
int nStepCheckpoint; int nStepCheckpoint;
/* indicates whether we make a checkpoint after each traing epoch */ /* indicates whether we make a checkpoint after each training epoch */
bool useEpochCheckpoint; bool useEpochCheckpoint;
/* number of batches on which we do model update */ /* number of batches on which we do model update */
int updateStep; int updateStep;
...@@ -119,25 +117,24 @@ public: ...@@ -119,25 +117,24 @@ public:
~T2TTrainer(); ~T2TTrainer();
/* initialize the trainer */ /* initialize the trainer */
void Init(int argc, char ** argv); void Init(T2TConfig& config);
/* train the model */ /* train the model */
void Train(const char * fn, const char * validFN, const char * modelFN, T2TModel * model); void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
/* test the model */ /* test the model */
void Validate(const char * fn, const char * ofn, T2TModel * model); void Validate(const char* fn, const char* ofn, T2TModel* model);
/* make a checkpoint */ /* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id); void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */ /* update the model by delta rule */
void Update(T2TModel * model, const float lr); void Update(T2TModel* model, const float lr);
/* prepare model for training */ /* prepare model for training */
void PrepareModel(T2TModel * model); void PrepareModel(T2TModel* model);
}; };
} }
#endif #endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "T2TDataSet.h"
#include "../module/T2TUtility.h"
using namespace transformer;
namespace nts {
/* sort the input by length (in descending order) */
void DataSet::SortInput() {
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
return a->values.count > b->values.count;
});
}
/* sort the output by id (in ascending order) */
void DataSet::SortOutput() {
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
return a->id < b->id;
});
}
/*
load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
inputBuffer.Clear();
bufferUsed = 0;
int id = 0;
const string tokenDelimiter = " ";
while (getline(*fp, line)) {
IntList values;
/* load words and transform them to ids */
auto indices = SplitToPos(line, tokenDelimiter);
/* keep only the first 120 words (MAX_WORD_NUM) if the input is too long */
size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
for (size_t i = 0; i < maxLen; i++) {
auto offset = (i != (indices.Size() - 1)) ?
indices[i + 1] - indices[i] - tokenDelimiter.size()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(3);
else
values.Add(srcVocab.word2id.at(word));
}
/* make sure that the sequence ends with EOS */
if (values.Size() != 0 && values[-1] != EOS)
values.Add(EOS);
Example* example = new Example;
example->id = id;
example->values = values;
if (values.Size() != 0)
inputBuffer.Add(example);
else
emptyLines.Add(id);
id++;
}
fp->close();
SortInput();
XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
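SplitToPos returns the start offset of every token, and the loop above recovers each word with substr; out-of-vocabulary words fall back to id 3 (presumably the unknown-word symbol, consistent with DumpRes below treating ids under 4 as special). A standalone sketch of the same offset-based split (hypothetical helper, not the library routine):

#include <cstdio>
#include <string>
#include <vector>

/* return the start offset of every delimiter-separated token */
std::vector<size_t> SplitToPosSketch(const std::string& s, const std::string& delim)
{
    std::vector<size_t> pos;
    size_t start = 0;
    while (start < s.size()) {
        pos.push_back(start);
        size_t next = s.find(delim, start);
        if (next == std::string::npos)
            break;
        start = next + delim.size();
    }
    return pos;
}

int main()
{
    std::string line = "a small test";
    std::vector<size_t> indices = SplitToPosSketch(line, " ");
    for (size_t i = 0; i < indices.size(); i++) {
        size_t end = (i + 1 < indices.size()) ? indices[i + 1] - 1   /* 1 = delimiter size */
                                              : line.size();
        printf("%s\n", line.substr(indices[i], end - indices[i]).c_str());
    }
    return 0;
}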
/*
load a mini-batch to the device
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - indices of the loaded sentences, with the total token count appended as the last element
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t minSentBatch, size_t batchSize, int devID)
{
size_t realBatchSize = minSentBatch;
/* get the maximum sentence length in a mini-batch */
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
/* dynamic batching for sentences */
while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
&& (realBatchSize * maxLen < batchSize)) {
realBatchSize++;
}
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
realBatchSize = inputBuffer.Size() - bufferUsed;
}
CheckNTErrors(maxLen != 0, "invalid length");
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
batchValues[i] = 1;
paddingValues[i] = 0.0F;
}
size_t cur = 0;
/* left padding */
UInt64List infos;
size_t totalLength = 0;
for (int i = 0; i < realBatchSize; ++i) {
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
paddingValues[cur++] = 1.0F;
}
}
infos.Add(totalLength);
InitTensor2D(batchEnc, realBatchSize, maxLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxLen, X_FLOAT, devID);
bufferUsed += realBatchSize;
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
return infos;
}
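Since LoadDataToBuffer sorts the input by length in descending order, the first sentence of a candidate batch is its longest, so realBatchSize * maxLen is exactly the padded token count of the batch. The loop grows the batch while that count stays under the word budget, which means the final batch can overshoot the budget by one sentence. A standalone sketch of the sizing rule with invented lengths:

#include <cstdio>
#include <vector>

int main()
{
    /* sentence lengths after the descending sort (invented values) */
    std::vector<size_t> lens = { 20, 18, 17, 9, 8, 4 };
    size_t bufferUsed = 0, minSentBatch = 1, batchSize = 64;   /* word budget */

    size_t realBatchSize = minSentBatch;
    size_t maxLen = lens[bufferUsed];          /* longest = first, thanks to sorting */
    while (realBatchSize < lens.size() - bufferUsed
           && realBatchSize * maxLen < batchSize)
        realBatchSize++;

    /* 3 * 20 = 60 < 64 still lets the batch grow, 4 * 20 = 80 stops it:
       we end up with 4 sentences, i.e. the budget is overshot by one */
    printf("batch of %zu sentences, each padded to %zu words\n",
           realBatchSize, maxLen);
    return 0;
}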
/*
the constructor of DataSet
>> dataFile - path of the data file
>> srcVocabFN - path of the source vocab file
>> tgtVocabFN - path of the target vocab file
*/
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
fp = new ifstream(dataFile);
CheckNTErrors(fp->is_open(), "cannot open the file");
bufferUsed = 0;
CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
CheckNTErrors(strcmp(tgtVocabFN, "") != 0, "missing target vocab file");
srcVocab.Load(srcVocabFN);
/* share source and target vocabs */
if (strcmp(srcVocabFN, tgtVocabFN) == 0) {
XPRINT(0, stderr, "[INFO] share source and target vocabs \n");
tgtVocab.CopyFrom(srcVocab);
}
else {
tgtVocab.Load(tgtVocabFN);
}
LoadDataToBuffer();
}
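A hedged sketch of how a caller is expected to drive DataSet (the file names and batch budget are hypothetical; error handling is left out):

DataSet dataSet;
dataSet.Init("test.src", "vocab.src", "vocab.tgt");

XTensor batchEnc;
XTensor paddingEnc;
while (!dataSet.IsEmpty()) {
    /* at least 8 sentences, at most ~4096 padded source words, on the CPU */
    UInt64List info = dataSet.LoadBatch(&batchEnc, &paddingEnc, 8, 4096, -1);
    /* ... translate the batch and append Result items to dataSet.outputBuffer ... */
}
dataSet.SortOutput();
dataSet.DumpRes("output.txt");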
/* check if the buffer is empty */
bool DataSet::IsEmpty() {
if (bufferUsed < inputBuffer.Size())
return false;
return true;
}
/* dump the translation to a file */
void DataSet::DumpRes(const char* ofn)
{
ofstream ofile(ofn, ios::out);
for (int t = 0; t < outputBuffer.Size(); t++) {
auto res = outputBuffer[t];
for (int i = 0; i < res->res.Size(); i++) {
if (res->res[i] < 4)
break;
ofile << tgtVocab.id2word[res->res[i]] << " ";
}
ofile << "\n";
}
ofile.close();
}
/* de-constructor */
DataSet::~DataSet()
{
/* release the file */
delete fp;
/* release the input buffer */
for (int i = 0; i < inputBuffer.Size(); i++)
delete inputBuffer[i];
/* release the output buffer */
for (int i = 0; i < outputBuffer.Size(); i++)
delete outputBuffer[i];
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "T2TVocab.h"
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* the struct of tokenized input */
struct Example {
int id;
IntList values;
};
/* the struct of tokenized output */
struct Result {
int id;
IntList res;
};
/* A `DataSet` is associated with a file which contains variable length data.*/
struct DataSet {
public:
/* the data buffer */
InputBufferType inputBuffer;
/* a list of empty line numbers */
IntList emptyLines;
/* the result buffer */
OutputBufferType outputBuffer;
/* the pointer to file stream */
ifstream* fp;
/* size of used data in buffer */
size_t bufferUsed;
/* the source vocabulary */
Vocab srcVocab;
/* the target vocabulary */
Vocab tgtVocab;
public:
/* sort the input by length */
void SortInput();
/* reorder the output by ids */
void SortOutput();
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t sBatch, size_t wBatch, int devID);
/* initialization function */
void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
/* check if the buffer is empty */
bool IsEmpty();
/* dump the translations to a file */
void DumpRes(const char* ofn);
/* de-constructor */
~DataSet();
};
}
#endif // __DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -15,7 +15,13 @@ ...@@ -15,7 +15,13 @@
* limitations under the License. * limitations under the License.
*/ */
#include "../../tensor/core/CHeader.h" /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "T2TLengthPenalty.h" #include "T2TLengthPenalty.h"
using namespace nts; using namespace nts;
...@@ -23,24 +29,23 @@ using namespace nts; ...@@ -23,24 +29,23 @@ using namespace nts;
namespace transformer namespace transformer
{ {
/* /*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence where n = length of the sequence
>> length - length of the sequence (for each entry) >> length - length of the sequence
>> alpha - the parameter controls the length preference >> alpha - the parameter controls the length preference
<< return - length penaltyof the sequence (for each entry) << return - length penalty of the sequence
*/ */
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha) float T2TLengthPenalizer::GNMT(float length, float alpha)
{ {
XTensor base; float base;
XTensor lp; float lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1)); base = (length + 5.0F) / (1.0F + 5.0F);
base = (length + 5)/(1 + 5);
lp = pow(base, alpha);
lp = Power(base, alpha);
return lp; return lp;
} }
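For a concrete value: with alpha = 0.6 and a 15-token hypothesis,

lp = ((5 + 15)/(5 + 1))^{0.6} = (20/6)^{0.6} ≈ 2.06,

and the penalty grows with length; beam scores are typically divided by lp so that the search does not favor short outputs on length alone.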
} }
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -19,12 +19,14 @@ ...@@ -19,12 +19,14 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents. * Start of a new week - I just finished several documents.
* Writing document is harder than writing code :) * Writing document is harder than writing code :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TLENGTHPENALTY_H__ #ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__ #define __T2TLENGTHPENALTY_H__
#include "../../tensor/XTensor.h" #include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h"
using namespace nts; using namespace nts;
...@@ -37,10 +39,9 @@ namespace transformer ...@@ -37,10 +39,9 @@ namespace transformer
class T2TLengthPenalizer class T2TLengthPenalizer
{ {
public: public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */ where n = length of the sequence */
static static float GNMT(float length, float alpha);
XTensor GNMT(const XTensor & length, float alpha);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,10 +17,13 @@ ...@@ -17,10 +17,13 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#include <iostream>
#include "T2TPredictor.h" #include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h" #include "../module/T2TNNUtil.h"
using namespace nts; using namespace nts;
...@@ -37,24 +40,24 @@ T2TStateBundle::T2TStateBundle() ...@@ -37,24 +40,24 @@ T2TStateBundle::T2TStateBundle()
/* de-constructor */ /* de-constructor */
T2TStateBundle::~T2TStateBundle() T2TStateBundle::~T2TStateBundle()
{ {
if(states != NULL) if (states != NULL)
delete[] states; delete[] states;
} }
/* /*
create states create states
>> num - number of states >> num - number of states
*/ */
void T2TStateBundle::MakeStates(int num) void T2TStateBundle::MakeStates(int num)
{ {
CheckNTErrors(num > 0, "invalid number"); CheckNTErrors(num > 0, "invalid number");
if(states != NULL) if (states != NULL)
delete[] states; delete[] states;
states = new T2TState[num]; states = new T2TState[num];
for(int i = 0; i < num; i++){ for (int i = 0; i < num; i++) {
states[i].prediction = -1; states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY; states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false; states[i].isEnd = false;
...@@ -73,7 +76,7 @@ void T2TStateBundle::MakeStates(int num) ...@@ -73,7 +76,7 @@ void T2TStateBundle::MakeStates(int num)
/* constructor */ /* constructor */
T2TPredictor::T2TPredictor() T2TPredictor::T2TPredictor()
{ {
startSymbol = -1; startSymbol = 2;
} }
/* de-constructor */ /* de-constructor */
...@@ -81,36 +84,27 @@ T2TPredictor::~T2TPredictor() ...@@ -81,36 +84,27 @@ T2TPredictor::~T2TPredictor()
{ {
} }
/* /*
create an initial state create an initial state
>> model - the t2t model >> model - the t2t model
>> top - the top-most layer of the network >> top - the top-most layer of the network
>> input - input of the network >> input - input of the network
>> beamSize - beam size >> beamSize - beam size
>> state - the state to be initialized >> state - the state to be initialized
*/ */
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state) void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
int beamSize, T2TStateBundle* state)
{ {
state->layersEnc.Clear();
state->layersDec.Clear();
XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
CheckNTErrors(encoding != NULL, "No encoding layers found!");
state->layersEnc.Add(encoding);
state->layersDec.Add(NULL);
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++) for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i); dims[i] = input->dimSize[i];
dims[input->order - 1] = beamSize; dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID); InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID); InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll(); state->probPath.SetZeroAll();
state->nstep.SetZeroAll(); state->nstep = 0.0F;
state->endMark.SetZeroAll(); state->endMark.SetZeroAll();
state->stateNum = 0; state->stateNum = 0;
...@@ -125,15 +119,15 @@ void T2TPredictor::SetStartSymbol(int symbol) ...@@ -125,15 +119,15 @@ void T2TPredictor::SetStartSymbol(int symbol)
startSymbol = symbol; startSymbol = symbol;
} }
/* /*
read a state read a state
>> model - the t2t model that keeps the network created so far >> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps >> state - a set of states. It keeps
1) hypotheses (states) 1) hypotheses (states)
2) probablities of hypotheses 2) probabilities of hypotheses
3) parts of the network for expanding toward the next state 3) parts of the network for expanding toward the next state
*/ */
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state) void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{ {
m = model; m = model;
s = state; s = state;
...@@ -141,118 +135,108 @@ void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state) ...@@ -141,118 +135,108 @@ void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
/* /*
predict the next state predict the next state
>> next - next states (assuming that the current state has been read) >> next - next states
>> encoding - encoder output >> aliveState - indices of the alive states, (B)
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder >> encoding - encoder output, (B, L, E)
>> inputEnc - input of the encoder, (B, L)
>> paddingEnc - padding of the encoder, (B, L)
>> batchSize - the raw batch size (in case some states are pruned)
>> isStart - whether it is the start state or not
>> reorderState - the new order of states
>> needReorder - whether we need to reorder the states
>> nstep - current time step of the target sequence
*/ */
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding, void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor * inputEnc, XTensor * paddingEnc) XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
{ {
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */ /* word indices of positions up to next state */
XTensor inputDec; XTensor inputDec;
/* the first token */ /* the first token */
XTensor first; XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID); InitTensor2D(&first, batchSize, 1, X_INT, inputEnc.devID);
first.SetDataFixed(startSymbol); first.SetDataFixed(startSymbol);
/* add a new word into the input sequence of the decoder side */ /* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) { if (isStart) {
inputDec = Identity(first); inputDec = Identity(first);
} }
else{ else {
inputDec = GeneratePaths(s); /* only pass one step to the decoder */
inputDec.SetDevice(inputEnc->devID); inputDec = GetLastPrediction(s, inputEnc.devID);
}
inputDec = Concatenate(first, inputDec, inputDec.order - 1); /* keep alive states for the decoder */
if (aliveState.dimSize[0] < batchSize) {
/* alive inputs */
inputDec = AutoGather(inputDec, aliveState);
/* alive cache */
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].KeepAlive(aliveState);
m->decoder->enDeAttCache[i].KeepAlive(aliveState);
}
}
if (needReorder) {
for (int i = 0; i < m->decoder->nlayer; i++) {
m->decoder->selfAttCache[i].Reorder(reorderState);
m->decoder->enDeAttCache[i].Reorder(reorderState);
}
} }
/* prediction probabilities */ /* prediction probabilities */
XTensor &output = next->prob; XTensor& output = next->prob;
XTensor decoding; XTensor decoding;
XTensor decodingStep;
for (int i = 0; i < inputDec.order - 1; i++)
for(int i = 0; i < inputDec.order - 1; i++) dims[i] = inputDec.dimSize[i];
dims[i] = inputDec.GetDim(i); dims[inputDec.order - 1] = inputDec.dimSize[inputDec.order - 1];
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec; XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID); InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc.devID);
paddingDec.SetDataFixed(1); paddingDec.SetDataFixed(1);
XTensor maskDec; XTensor maskDec;
XTensor maskEncDec; XTensor maskEncDec;
/* decoder mask */ /* decoder mask */
m->MakeMTMaskDec(*paddingEnc, paddingDec, maskDec, maskEncDec); m->MakeMTMaskDec(paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */ /* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false); decoding = m->decoder->Make(inputDec, encoding, NULL, &maskEncDec, nstep, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!"); CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID);
selectTgt.SetDevice(decoding.devID);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */ /* generate the output probabilities */
m->outputLayer->Make(decodingStep, output); m->outputLayer->Make(decoding, output, false, true);
next->layersEnc.AddList(&s->layersEnc);
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
} }
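When the beam shrinks, KeepAlive and Reorder above gather the cached keys and values of the surviving hypotheses by index. Neither routine appears in this diff; a minimal sketch of the row-gather they imply, on a flat float buffer (RowGather and the layout are illustrative assumptions, not the library API):

    #include <vector>
    #include <cstring>

    /* keep only the rows listed in aliveIdx; each row is a contiguous vector of width dim */
    static std::vector<float> RowGather(const std::vector<float>& buf,
                                        const std::vector<int>& aliveIdx, int dim)
    {
        std::vector<float> out(aliveIdx.size() * dim);
        for (size_t r = 0; r < aliveIdx.size(); r++)
            std::memcpy(&out[r * dim], &buf[aliveIdx[r] * dim], dim * sizeof(float));
        return out;
    }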
/* /*
generate paths up to the states of the current step generate paths up to the states of the current step
>> state - state bundle of the current step >> state - state bundle of the current step
*/ */
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state) XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{ {
CheckNTErrors(state->stateNum >= 0, "Illegal state!"); CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1; int distance = -1;
for(int i = 0; i < state->stateNum; i++){ for (int i = 0; i < state->stateNum; i++) {
T2TState * cur = state->states + i; T2TState* cur = state->states + i;
int nsteps = 0; int nsteps = 0;
while(cur != NULL){ while (cur != NULL) {
nsteps++; nsteps++;
cur = cur->last; cur = cur->last;
} }
if(nsteps > distance) if (nsteps > distance)
distance = nsteps; distance = nsteps;
} }
...@@ -260,11 +244,11 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state) ...@@ -260,11 +244,11 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
InitTensor2D(&path, state->stateNum, distance, X_INT); InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll(); path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){ for (int i = 0; i < state->stateNum; i++) {
T2TState * cur = state->states + i; T2TState* cur = state->states + i;
int nsteps = 0; int nsteps = 0;
while(cur != NULL){ while (cur != NULL) {
nsteps++; nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps); path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last; cur = cur->last;
...@@ -274,5 +258,28 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state) ...@@ -274,5 +258,28 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
return path; return path;
} }
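For intuition: Set2DInt writes each back-pointer chain right-aligned, so with three states whose chains decode to (7), (7, 9) and (7, 9, 5), distance is 3 and the path matrix becomes

    0 0 7
    0 7 9
    7 9 5

with zero padding on the left and the latest prediction always in the last column.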
/*
get the predictions of the previous step
>> state - state bundle of the current step
>> devID - the device id for the predictions
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
IntList last;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
last.Add(cur->prediction);
}
XTensor lastPred;
InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
lastPred.SetData(last.items, last.Size());
return lastPred;
} }
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -18,29 +18,32 @@ ...@@ -18,29 +18,32 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start! * This is the first source file I create in 2019 - new start!
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/ */
#ifndef __T2TPREDICTOR_H__ #ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__ #define __T2TPREDICTOR_H__
#include "T2TModel.h" #include "../T2TModel.h"
#include "T2TLengthPenalty.h" #include "T2TLengthPenalty.h"
using namespace std;
namespace transformer namespace transformer
{ {
#define T2T_PID_EMPTY -1 #define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution, /* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypothsis in translation. */ etc. It can be regarded as a hypothesis in translation. */
class T2TState class T2TState
{ {
public: public:
/* we assume that the prediction is an integer */ /* we assume that the prediction is an integer */
int prediction; int prediction;
/* id of the problem. One can regard it as the sentence id when we /* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */ is empty if id = -1 */
int pid; int pid;
...@@ -62,11 +65,11 @@ public: ...@@ -62,11 +65,11 @@ public:
/* model score of every path. A model score = path probability + some other stuff */ /* model score of every path. A model score = path probability + some other stuff */
float modelScore; float modelScore;
/* nubmer of steps we go over so far */ /* number of steps we go over so far */
int nstep; int nstep;
/* pointer to the previous state */ /* pointer to the previous state */
T2TState * last; T2TState* last;
}; };
/* a bundle of states */ /* a bundle of states */
...@@ -75,11 +78,11 @@ class T2TStateBundle ...@@ -75,11 +78,11 @@ class T2TStateBundle
public: public:
/* predictions */ /* predictions */
XTensor prediction; XTensor prediction;
/* id of the previous state that generates the current one */ /* id of the previous state that generates the current one */
XTensor preID; XTensor preID;
/* mark that indicates whether each hypothesis is completed */ /* mark that indicates whether each hypothesis is completed */
XTensor endMark; XTensor endMark;
/* probability of every prediction (last state of the path) */ /* probability of every prediction (last state of the path) */
...@@ -91,18 +94,11 @@ public: ...@@ -91,18 +94,11 @@ public:
/* model score of every path */ /* model score of every path */
XTensor modelScore; XTensor modelScore;
/* step number of each hypothesis */ /* step number of each hypothesis */
XTensor nstep; float nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states */ /* list of states */
T2TState * states; T2TState* states;
/* number of states */ /* number of states */
int stateNum; int stateNum;
...@@ -121,23 +117,26 @@ public: ...@@ -121,23 +117,26 @@ public:
void MakeStates(int num); void MakeStates(int num);
}; };
/* The predictor reads the current state and then predicts the next. /* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference - It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word. we get the state of previous words and then generate the next word.
Here, a state can be regared as the representation of words (word Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */ indices, hidden states, embeddings, etc.). */
class T2TPredictor class T2TPredictor
{ {
private: private:
/* pointer to the transformer model */ /* pointer to the transformer model */
T2TModel * m; T2TModel* m;
/* current state */ /* current state */
T2TStateBundle * s; T2TStateBundle* s;
/* start symbol */ /* start symbol */
int startSymbol; int startSymbol;
/* end symbol */
int endSymbol;
public: public:
/* constructor */ /* constructor */
T2TPredictor(); T2TPredictor();
...@@ -146,19 +145,24 @@ public: ...@@ -146,19 +145,24 @@ public:
~T2TPredictor(); ~T2TPredictor();
/* create an initial state */ /* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state); void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */ /* set the start symbol */
void SetStartSymbol(int symbol); void SetStartSymbol(int symbol);
/* read a state */ /* read a state */
void Read(T2TModel * model, T2TStateBundle * state); void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */ /* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc); void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */ /* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state); XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state, int devID);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
BeamSearch::BeamSearch()
{
alpha = 0;
maxLength = 0;
beamSize = 0;
batchSize = 0;
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
BeamSearch::~BeamSearch()
{
if (fullHypos != NULL)
delete[] fullHypos;
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void BeamSearch::Init(T2TConfig& config)
{
beamSize = config.beamSize;
batchSize = config.sBatchSize;
alpha = config.lenAlpha;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
*/
void BeamSearch::Prepare(int myBatchSize, int myBeamSize)
{
batchSize = myBatchSize;
beamSize = myBeamSize;
needReorder = false;
/* prepare for the heap of hypotheses */
if (fullHypos != NULL)
delete[] fullHypos;
fullHypos = new XHeap<MIN_HEAP, float>[batchSize];
for (int i = 0; i < batchSize; i++)
fullHypos[i].Init(beamSize);
/* prepare for the indices of alive states */
aliveStatePids.Clear();
aliveSentList.Clear();
for (int i = 0; i < batchSize; i++) {
aliveStatePids.Add(i);
aliveSentList.Add(i);
}
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score)
{
T2TPredictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
XTensor inputBeam;
XTensor paddingBeam;
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input.unitNum / input.dimSize[input.order - 1], beamSize);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(input, &maskEnc, false);
encodingBeam = Unsqueeze(encoding, encoding.order - 2, beamSize);
inputBeam = Unsqueeze(input, input.order - 1, beamSize);
paddingBeam = Unsqueeze(padding, padding.order - 1, beamSize);
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
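Here Unsqueeze inserts a beam axis and ReshapeMerged folds it back into the batch axis: the encoder output goes (B, L, E) -> (B, beam, L, E) -> (B * beam, L, E), and the inputs and paddings go (B, L) -> (B * beam, L), so the beamSize hypotheses of a sentence all share that sentence's encoding.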
/* max output-length = scalar * source-length */
int lengthLimit = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
T2TStateBundle* states = new T2TStateBundle[lengthLimit + 1];
T2TStateBundle* first = states;
T2TStateBundle* cur = NULL;
T2TStateBundle* next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first);
predictor.SetStartSymbol(startSymbol);
first->isStart = true;
XTensor aliveState;
InitTensor1D(&aliveState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(aliveState, 0);
XTensor reorderState;
InitTensor1D(&reorderState, batchSize * beamSize, X_INT, input.devID);
SetAscendingOrder(reorderState, 0);
/* generate the sequence from left to right */
for (int l = 0; l < lengthLimit; l++) {
if (beamSize > 1) {
inputBeam = AutoGather(inputBeam, reorderState);
paddingBeam = AutoGather(paddingBeam, reorderState);
encodingBeam = AutoGather(encodingBeam, reorderState);
}
cur = states + l;
next = states + l + 1;
/* read the current state */
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, aliveState, encodingBeam, inputBeam,
paddingBeam, batchSize * beamSize, l == 0, reorderState, needReorder, l);
/* compute the model score (given the prediction probability) */
Score(cur, next);
/* beam pruning */
Generate(cur, next);
/* expand the search graph */
Expand(cur, next, reorderState);
/* push complete hypotheses into the heap */
Collect(next);
/* stop searching when all hypotheses are completed */
if (IsAllCompleted(next)) {
maxLength = l + 1;
break;
}
/* remove finished sentences */
//RemoveFinishedStates(next, encodingBeam, inputBeam, paddingBeam, aliveState);
}
/* fill the heap with incomplete hypotheses if necessary */
FillHeap(next);
Dump(output, &score);
delete[] states;
}
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor mask;
int order = prob.order;
int outputSize = prob.dimSize[prob.order - 1];
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++)
dims[i] = prob.dimSize[i];
if (prob.dataType == X_FLOAT16)
prob = ConvertDataType(prob, X_FLOAT);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
SumDim(prob, probPathPrev, probPath, 0);
beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */
score = probPath / lp;
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
SumDim(score, firstMask, score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
SumDim(score, mask, score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
}
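The division by lp is the GNMT-style length normalization. T2TLengthPenalizer::GNMT itself is not part of this diff; it is commonly defined as lp = ((5 + n) / 6)^alpha, which a short sketch (assuming that definition) makes concrete:

    #include <cmath>

    /* assumed GNMT length penalty: lp = ((5 + length) / 6) ^ alpha */
    float GNMTPenalty(float length, float alpha)
    {
        return std::pow((length + 5.0F) / 6.0F, alpha);
    }

With alpha = 0 the penalty is 1 and the score is the raw log-probability; larger alpha favors longer hypotheses.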
/*
generate tokens for the next state via beam pruning
>> prev - the last beam
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor indexCPU;
XTensor& score = beam->modelScore;
XTensor& index = beam->prediction;
XTensor& preID = beam->preID;
XTensor& probPath = beam->probPath;
XTensor& prob = beam->prob;
int order = score.order;
for (int i = 0; i < order; i++) {
dims[i] = score.dimSize[i];
dimsBeam[i] = score.dimSize[i];
dimsTopK[i] = score.dimSize[i];
}
CheckNTErrors(order >= 3, "The tensor must be of order 3 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
int sizeVocab = score.dimSize[score.order - 1];
int stride = score.dimSize[score.order - 1];
dimsBeam[order - 3] /= beamSize;
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType, score.devID);
InitTensor(&index, order, dimsTopK, X_INT, score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
InitTensor(&indexCPU, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
prob.Reshape(order, dimsBeam);
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypotheses. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypotheses in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary, i.e., the remainder of each top-k
index modulo vocab-size. */
ModMe(index, sizeVocab);
/* we keep the top-k scores */
score = CopyValues(scoreTopK);
for (int i = 0; i < indexCPU.unitNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
indexCPU.SetInt(i * stride + indexCPU.GetInt(i + j), i + j);
}
}
/* sequence probability of top-k candidates */
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.dimSize[i];
dimsTopK[i] = scoreTopK.dimSize[i];
}
order = probPath.order;
prob.Reshape(prob.unitNum, 1);
probPath.Reshape(probPath.unitNum, 1);
indexCPU.Reshape(indexCPU.dimSize[0], indexCPU.dimSize[indexCPU.order - 1]);
indexCPU.SetDevice(prob.devID);
prob = Gather(prob, indexCPU);
probPath = Gather(probPath, indexCPU);
prob.Reshape(order, dimsTopK);
probPath.Reshape(order, dimsTopK);
}
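The DescaleMe/ModMe pair works because TopK runs over a flattened (beamSize x vocabSize) axis, so every flat index decomposes uniquely as offset * vocab-size + word-id. A standalone sketch (names are illustrative):

    /* decompose a flat top-k index over a (beamSize x vocabSize) candidate axis */
    void DecomposeIndex(int flat, int vocabSize, int& prevState, int& wordId)
    {
        prevState = flat / vocabSize;  /* which hypothesis in the beam is extended */
        wordId    = flat % vocabSize;  /* which word the hypothesis predicts */
    }

For example, flat = 25013 with vocabSize = 12000 gives prevState = 2 and wordId = 1013.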
/*
expand the search graph
>> prev - the last beam
>> beam - the beam that keeps a number of states
>> reorderState - the new order of states
*/
void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor reorderStateCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorOnCPU(&reorderStateCPU, &reorderState);
/* we copy the data to CPU because the frequent access to GPU is slow
and we can speed-up the process by doing the job on CPU. */
CopyValues(idRef, id);
CopyValues(modelScoreRef, modelScore);
CopyValues(probRef, prob);
CopyValues(probPathRef, probPath);
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
reorderStateCPU.SetInt(i + offset, i + j);
if (offset != j)
needReorder = true;
T2TState* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
/* pointer to the previous state */
if (prev->isStart) {
state.last = NULL;
state.pid = pid;
state.nstep = 0;
state.isCompleted = false;
}
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
state.isCompleted = last->isCompleted;
CheckNTErrors(offset < prev->stateNum, "Wrong state index!");
}
/*if(aliveStatePids.size() < batchSize)
state.pid = aliveStatePids[i/beamSize];*/
/* scores */
state.modelScore = modelScore.Get(k);
state.prob = prob.Get(k);
state.probPath = probPath.Get(k);
/* prediction */
state.prediction = prediction.GetInt(k);
CheckNTErrors(state.prediction >= 0, "Illegal prediction!");
/* check if it is the end of the sequence */
state.isEnd = IsEnd(state.prediction);
state.isCompleted = (state.isCompleted || state.isEnd);
/* set the ending mark */
endMarkCPU.SetInt(state.isEnd, k);
}
}
/* copy the ending mark from CPU to the target device */
CopyValues(endMarkCPU, endMark);
CopyValues(reorderStateCPU, reorderState);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Collect(T2TStateBundle* beam)
{
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
/*
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void BeamSearch::FillHeap(T2TStateBundle* beam)
{
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) {
T2TState& state = states[i * beamSize + j];
/* we push the incomplete hypothesis into the heap */
if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) {
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
else {
auto node = fullHypos[state.pid].Top();
float score = node.value;
if (score < state.modelScore)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
}
}
/*
save the output sequences in a tensor
>> output - output sequences (for return)
>> score - score of the sequences
*/
void BeamSearch::Dump(IntList* output, XTensor* score)
{
int dims[3] = { batchSize, 1, maxLength };
InitTensor(score, 2, dims, X_FLOAT);
score->SetZeroAll();
/* heap for an input sentence in the batch */
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float>& heap = fullHypos[h];
int c = heap.Count();
float bestScore = -1e9F;
T2TState* state = NULL;
for (int i = 0; i < c; i++) {
auto node = heap.Pop();
T2TState* s = (T2TState*)node.index;
if (i == 0 || bestScore < node.value) {
state = s;
bestScore = node.value;
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
if (isCompleted) {
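/* positions after the first end symbol are dumped as EOS (2) rather than their raw predictions; see T2TVocab.h */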
output[h].Add(2);
}
else {
output[h].Add(state->prediction);
}
state = state->last;
}
output[h].Reverse();
score->Set2D(bestScore, h, 0);
}
}
/*
check if the token is an end symbol
>> token - token to be checked
*/
bool BeamSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool BeamSearch::IsAllCompleted(T2TStateBundle* beam)
{
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
if (!state.isCompleted)
return false;
}
return true;
}
/*
update the beam by removing finished hypotheses
>> beam - the beam that keeps the searching states
>> aliveEncoding - new input embeddings for the encoder, (B, L, E)
>> aliveInput - new input tokens of the encoder, (B, L)
>> alivePadding - new paddings for the inputs, (B, L)
<< aliveIdx - the indices of alive states
*/
void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState)
{
T2TState* states = beam->states;
/* get the indices of uncompleted sentences and states */
aliveSentList.Clear();
IntList aliveStateList;
int count = 0;
/* the number of completed sentences */
for (int i = 0; i < beam->stateNum; i += beamSize) {
int endState = 0;
for (int j = 0; j < beamSize; j++) {
if (states[i + j].isEnd) {
endState++;
}
}
bool isSentCompleted = (endState == beamSize);
int sent = i / beamSize;
if (!isSentCompleted) {
aliveSentList.Add(sent);
for (int j = 0; j < beamSize; j++) {
aliveStateList.Add(i + j);
}
}
else {
aliveStatePids.Remove(sent - count);
count++;
}
}
InitTensor1D(&aliveState, aliveStateList.Size(), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, aliveStateList.Size());
XTensor aliveSent;
InitTensor1D(&aliveSent, aliveSentList.Size(), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, aliveSentList.Size());
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState);
alivePadding = AutoGather(alivePadding, aliveState);
aliveEncoding = AutoGather(aliveEncoding, aliveState);
beam->prob = AutoGather(beam->prob, aliveSent);
beam->endMark = AutoGather(beam->endMark, aliveSent);
beam->probPath = AutoGather(beam->probPath, aliveSent);
beam->modelScore = AutoGather(beam->modelScore, aliveSent);
beam->prediction = AutoGather(beam->prediction, aliveSent);
}
}
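The `sent - count` offset in the loop above compensates for earlier removals within the same pass, since each Remove shifts the remaining entries left by one. For example, in a 4-sentence batch where sentences 1 and 2 finish together, the calls are Remove(1 - 0) and then Remove(2 - 1), and both land on the intended entry of the shrinking aliveStatePids list.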
/*
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor BeamSearch::MakeFirstMask(T2TStateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order - 1; i++)
dims[i] = prob.dimSize[i];
InitTensor(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
if (i % beamSize != 0)
mask.Set(-1e9, i);
}
mask.SetDevice(prob.devID);
return mask;
}
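For intuition: after the beam tiling in Search, all beamSize copies of a sentence start from the identical start symbol, so the first TopK would otherwise pick the same (state, word) pair beamSize times. With beamSize = 4 the per-sentence mask is [0, -1e9, -1e9, -1e9], which leaves only the first copy competitive and lets the first expansion select beamSize distinct words.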
/* constructor */
GreedySearch::GreedySearch()
{
maxLength = 0;
batchSize = 0;
endSymbolNum = 0;
endSymbols = new int[32];
startSymbol = -1;
}
/* de-constructor */
GreedySearch::~GreedySearch()
{
if (endSymbols != NULL)
delete[] endSymbols;
}
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void GreedySearch::Init(T2TConfig& config)
{
batchSize = config.wBatchSize;
endSymbols[0] = config.endID;
startSymbol = config.startID;
scalarMaxLength = config.maxLenAlpha;
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
prepare for search
>> batchSize - size of the batch
*/
void GreedySearch::Prepare(int myBatchSize)
{
batchSize = myBatchSize;
}
/* check if the token is an end symbol */
bool GreedySearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
return false;
}
/* set end symbols for search */
void GreedySearch::SetEnd(const int* tokens, const int tokenNum)
{
if (endSymbols != NULL)
delete[] endSymbols;
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
endSymbols = new int[tokenNum];
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void GreedySearch::Search(T2TModel* model, XTensor& input,
XTensor& padding, IntList* output)
{
XTensor maskEnc;
XTensor encoding;
/* dynamic batch size */
Prepare(input.unitNum / input.dimSize[input.order - 1]);
/* encoder mask */
model->MakeMTMaskEnc(padding, maskEnc);
/* make the encoding network */
encoding = model->encoder->Make(input, &maskEnc, false);
/* max output-length = scalar * source-length */
maxLength = (int)(input.dimSize[input.order - 1] * scalarMaxLength);
/* the first token */
XTensor inputDec;
InitTensor2D(&inputDec, batchSize, 1, X_INT, input.devID);
inputDec.SetDataFixed(startSymbol);
/* initialize the finished flags */
int* finishedFlags = new int[batchSize];
for (int i = 0; i < batchSize; i++)
finishedFlags[i] = 0;
/* generate the sequence from left to right */
int l = 0;
for (; l < maxLength; l++) {
XTensor prob;
XTensor maskDec;
XTensor maskEncDec;
XTensor paddingDec;
XTensor decoding;
XTensor indexCPU;
XTensor bestScore;
InitTensor(&paddingDec, inputDec.order, inputDec.dimSize, X_INT, padding.devID);
paddingDec.SetDataFixed(1);
/* decoder mask */
model->MakeMTMaskDec(padding, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = model->decoder->Make(inputDec, encoding, NULL, &maskEncDec, l, false);
/* generate the output probabilities */
model->outputLayer->Make(decoding, prob, false, false);
/* get the most promising prediction */
prob.Reshape(prob.dimSize[0], prob.dimSize[prob.order - 1]);
InitTensor2D(&bestScore, prob.dimSize[0], 1, prob.dataType, prob.devID);
TopK(prob, bestScore, inputDec, -1, 1);
/* save the prediction */
InitTensorOnCPU(&indexCPU, &inputDec);
CopyValues(inputDec, indexCPU);
for (int i = 0; i < batchSize; i++) {
output[i].Add(indexCPU.GetInt(i));
if (IsEnd(indexCPU.GetInt(i)))
finishedFlags[i] = 1;
}
int finished = 0;
for (int i = 0; i < batchSize; i++)
finished += finishedFlags[i];
if (finished == batchSize)
break;
}
delete[] finishedFlags;
}
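TopK with k = 1 along the last axis acts as a batched argmax here: bestScore gets the maximum probability of each row and inputDec the corresponding token id, which is fed straight back as the next decoder input. The same selection on plain arrays, as a minimal sketch:

    /* per-row argmax over a (rows x vocab) probability matrix */
    void Argmax(const float* prob, int rows, int vocab, int* bestId, float* bestScore)
    {
        for (int r = 0; r < rows; r++) {
            int arg = 0;
            for (int v = 1; v < vocab; v++)
                if (prob[r * vocab + v] > prob[r * vocab + arg])
                    arg = v;
            bestId[r] = arg;
            bestScore[r] = prob[r * vocab + arg];
        }
    }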
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -17,22 +17,25 @@ ...@@ -17,22 +17,25 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/ */
#ifndef __T2TSEARCH_H__ #ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__ #define __T2TSEARCH_H__
#include "T2TModel.h" #include "../T2TModel.h"
#include "T2TPredictor.h" #include "T2TPredictor.h"
using namespace std;
namespace transformer namespace transformer
{ {
/* The class orgnizes the search process. It calls "predictors" to generate /* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning. distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path respresents a translation hypothsis. This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */ The output can be the path with the highest model score. */
class T2TSearch class BeamSearch
{ {
private: private:
/* the alpha parameter controls the length preference */ /* the alpha parameter controls the length preference */
...@@ -40,10 +43,10 @@ private: ...@@ -40,10 +43,10 @@ private:
/* predictor */ /* predictor */
T2TPredictor predictor; T2TPredictor predictor;
/* max length of the generated sequence */ /* max length of the generated sequence */
int maxLength; int maxLength;
/* beam size */ /* beam size */
int beamSize; int beamSize;
...@@ -51,10 +54,10 @@ private: ...@@ -51,10 +54,10 @@ private:
int batchSize; int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */ /* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos; XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */ /* array of the end symbols */
int * endSymbols; int* endSymbols;
/* number of the end symbols */ /* number of the end symbols */
int endSymbolNum; int endSymbolNum;
...@@ -62,48 +65,118 @@ private: ...@@ -62,48 +65,118 @@ private:
/* start symbol */ /* start symbol */
int startSymbol; int startSymbol;
/* scale factor on the source length that bounds the number of search steps */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
/* pids for alive states */
IntList aliveStatePids;
/* alive sentences */
IntList aliveSentList;
/* whether we need to reorder the states */
bool needReorder;
public: public:
/* constructor */ /* constructor */
T2TSearch(); BeamSearch();
/* de-constructor */ /* de-constructor */
~T2TSearch(); ~BeamSearch();
/* initialize the model */ /* initialize the model */
void Init(int argc, char ** argv); void Init(T2TConfig& config);
/* search for the most promising states */ /* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output); void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */ /* preparation */
void Prepare(int myBatchSize,int myBeamSize); void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */ /* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam); void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */ /* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam); void Generate(T2TStateBundle* prev, T2TStateBundle* beam);
/* expand the search graph */ /* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam); void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */ /* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam); void Collect(T2TStateBundle* beam);
/* fill the hypotheis heap with incomplete hypothses */ /* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam); void FillHeap(T2TStateBundle* beam);
/* save the output sequences in a tensor */ /* save the output sequences and score */
void Dump(XTensor * output); void Dump(IntList* output, XTensor* score);
/* check if the token is an end symbol */ /* check if the token is an end symbol */
bool IsEnd(int token); bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam);
/* update the beam by pruning finished states */
void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */ /* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum); void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */ /* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam); XTensor MakeFirstMask(T2TStateBundle* beam);
};
class GreedySearch
{
private:
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* batch size */
int batchSize;
/* array of the end symbols */
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
/* scale factor on the source length that bounds the number of search steps */
float scalarMaxLength;
public:
/* constructor */
GreedySearch();
/* de-constructor */
~GreedySearch();
/* initialize the model */
void Init(T2TConfig& config);
/* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */
void Prepare(int myBatchSize);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TTranslator.h"
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTranslator::T2TTranslator()
{
}
/* de-constructor */
T2TTranslator::~T2TTranslator()
{
if (beamSize > 1)
delete (BeamSearch*)seacher;
else
delete (GreedySearch*)seacher;
}
/* initialize the model */
void T2TTranslator::Init(T2TConfig& config)
{
beamSize = config.beamSize;
vSize = config.srcVocabSize;
vSizeTgt = config.tgtVocabSize;
sentBatch = config.sBatchSize;
wordBatch = config.wBatchSize;
if (beamSize > 1) {
XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize);
seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config);
}
else if (beamSize == 1) {
XPRINT1(0, stderr, "Translating with greedy search (%d)\n", beamSize);
seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config);
}
else {
CheckNTErrors(false, "invalid beam size\n");
}
}
/*
translate an input file with a trained model
>> ifn - input data file
>> sfn - source vocab file
>> tfn - target vocab file
>> ofn - output data file
>> model - pretrained model
*/
void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
int devID = model->devID;
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
/* padding */
XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn);
XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n",
GetClockSec() - startT);
int count = 0;
double batchStart = GetClockSec();
while (!batchLoader.IsEmpty())
{
count++;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
auto indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc,
sentBatch, wordBatch, devID);
IntList* output = new IntList[indices.Size() - 1];
/* greedy search */
if (beamSize == 1) {
((GreedySearch*)seacher)->Search(model, batchEnc, paddingEnc, output);
}
/* beam search */
else {
XTensor score;
((BeamSearch*)seacher)->Search(model, batchEnc, paddingEnc, output, score);
}
for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result;
res->id = indices[i];
res->res = output[i];
batchLoader.outputBuffer.Add(res);
}
delete[] output;
wc += indices[-1];
wordCountTotal += indices[-1];
sentCount += (indices.Size() - 1);
batchCount += 1;
if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec();
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
wc = 0;
}
}
/* append empty lines to the result */
for (int i = 0; i < batchLoader.emptyLines.Size(); i++) {
Result* emptyRes = new Result;
emptyRes->id = batchLoader.emptyLines[i];
batchLoader.outputBuffer.Add(emptyRes);
}
double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%d)\n",
wordCountTotal, (int)(batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size()));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTranslator::Dump(FILE* file, XTensor* output)
{
if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1];
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
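/* stop at an illegal id, PAD (1), or SOS/EOS (2); see the symbol defines in T2TVocab.h */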
if (w < 0 || w == 1 || w == 2)
break;
fprintf(file, "%d ", w);
}
fprintf(file, "\n");
}
}
else
{
fprintf(file, "\n");
}
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -18,19 +18,20 @@ ...@@ -18,19 +18,20 @@
/* /*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :) * A week with no trips :)
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/ */
#ifndef __T2TTESTER_H__ #ifndef __T2TTESTER_H__
#define __T2TTESTER_H__ #define __T2TTESTER_H__
#include "T2TSearch.h" #include "T2TSearch.h"
#include "T2TBatchLoader.h" #include "T2TDataSet.h"
namespace transformer namespace transformer
{ {
/* This class translates test sentences with a trained model. */ /* This class translates test sentences with a trained model. */
class T2TTester class T2TTranslator
{ {
public: public:
/* vocabulary size of the source side */ /* vocabulary size of the source side */
...@@ -38,28 +39,38 @@ public: ...@@ -38,28 +39,38 @@ public:
/* vocabulary size of the target side */ /* vocabulary size of the target side */
int vSizeTgt; int vSizeTgt;
/* batch size for sentences */
int sentBatch;
/* batch size for words */
int wordBatch;
/* beam size */
int beamSize;
/* for batching */ /* for batching */
T2TBatchLoader batchLoader; DataSet batchLoader;
/* decoder for inference */ /* decoder for inference */
T2TSearch seacher; void* seacher;
public: public:
/* constructor */ /* constructor */
T2TTester(); T2TTranslator();
/* de-constructor */ /* de-constructor */
~T2TTester(); ~T2TTranslator();
/* initialize the model */ /* initialize the model */
void Init(int argc, char ** argv); void Init(T2TConfig& config);
/* test the model */ /* translate an input file */
void Test(const char * fn, const char * ofn, T2TModel * model); void Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, T2TModel* model);
/* dump the result into the file */ /* dump the result into the file */
void Dump(FILE * file, XTensor * output); void Dump(FILE* file, XTensor* output);
}; };
} }
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#include <fstream>
#include "T2TVocab.h"
#include "../module/T2TUtility.h"
namespace nts {
/* load a vocabulary from a file */
void Vocab::Load(const string& src)
{
string vsz, sid;
ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "Unable to open the vocabulary file");
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
}
f.close();
}
/* save a vocabulary to a file */
void Vocab::Save(const string& src)
{
ofstream f(src, ios::out);
/* the first line: size of the vocab and the start id */
f << vocabSize << "\t" << startID << "\n";
/* other lines: words and indices */
for (const auto& p : word2id)
f << p.first << "\t" << p.second << "\n";
f.close();
}
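With the newline fix above, Save emits exactly the layout Load parses: a header line with the vocabulary size and the start id, then one word-id pair per line. A hypothetical three-word vocabulary with startID = 3 (ids 0-2 are reserved for the special symbols in T2TVocab.h) would be stored as:

    6	3
    hello	3
    world	4
    !	5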
/*
copy data from another vocabulary
>> v - the vocabulary to copy from
*/
void Vocab::CopyFrom(const Vocab& v)
{
for (const auto& w2i : v.word2id)
word2id.insert(w2i);
for (const auto& i2w : v.id2word)
id2word.insert(i2w);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#ifndef __T2TVOCAB_H__
#define __T2TVOCAB_H__
#include <cstdio>
#include <unordered_map>
using namespace std;
namespace nts {
/* user-defined symbols */
#define UNK 0
#define PAD 1
#define SOS 2
#define EOS 2
/* the vocabulary class */
struct Vocab
{
/* the start id for words */
int startID;
/* size of the vocabulary */
int vocabSize;
/* a dict that maps words to ids */
unordered_map<string, int> word2id;
/* a dict that maps ids to words */
unordered_map<int, string> id2word;
/* load a vocabulary from a file */
void Load(const string& src);
/* save a vocabulary to a file */
void Save(const string& src);
/* copy data from another vocab */
void CopyFrom(const Vocab& v);
};
}
#endif
\ No newline at end of file
...@@ -26,13 +26,15 @@ ...@@ -26,13 +26,15 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* default settings */
#define X_ENABLE_GRAD true
/* /*
* we define the "new and delete" functions below * we define the "new and delete" functions below
*/ */
/* global flag for enabling gradient flows or not */
static bool X_ENABLE_GRAD = true;
#define DISABLE_GRAD X_ENABLE_GRAD=false
#define ENABLE_GRAD X_ENABLE_GRAD=true
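A minimal usage sketch of the new switches; the call sites below are placeholders, not part of this diff. Since the macros just flip the global flag, the idiom is to bracket inference with them:

    DISABLE_GRAD;                                     /* skip building the backward graph */
    translator.Translate(ifn, sfn, tfn, ofn, &model); /* hypothetical inference call */
    ENABLE_GRAD;                                      /* restore gradient flow for training */

Note that X_ENABLE_GRAD is declared static in a header, so each translation unit gets its own copy of the flag and the switch only affects code compiled in the same translation unit.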
/* initialize a XTensor V2 */ /* initialize a XTensor V2 */
void InitTensorV2(XTensor * tensor, void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
......
...@@ -110,7 +110,7 @@ namespace nts { ...@@ -110,7 +110,7 @@ namespace nts {
#define FIELD_SEP " ||| " #define FIELD_SEP " ||| "
#define FLOAT_MIN float(-1.0E38) #define FLOAT_MIN float(-1.0E38)
#define FLOAT16_MIN float(-65504) #define FLOAT16_MIN float(-65504)
#define MILLION 1024 * 1024 #define MILLION 1000000
#define LOG_E_10 2.302585 #define LOG_E_10 2.302585
#define LEADING_DIM 1 #define LEADING_DIM 1
......
...@@ -24,7 +24,6 @@ ...@@ -24,7 +24,6 @@
*/ */
#include "time.h" #include "time.h"
#include "XMem.h"
#include "XList.h" #include "XList.h"
#include "XGlobal.h" #include "XGlobal.h"
...@@ -35,47 +34,72 @@ namespace nts { ...@@ -35,47 +34,72 @@ namespace nts {
template <typename T> template <typename T>
TensorListBase<T>::TensorListBase() TensorListBase<T>::TensorListBase()
{ {
mem = NULL; maxNum = 1;
maxNum = 0;
count = 0; count = 0;
items = NULL; items = (T*)malloc(sizeof(T) * 1);
} }
/* /*
constructor constructor
>> myMaxNum - maximum number of items to keep >> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
*/ */
template <typename T> template <typename T>
TensorListBase<T>::TensorListBase(int myMaxNum) TensorListBase<T>::TensorListBase(int myMaxNum)
{ {
mem = NULL; CheckNTErrors(myMaxNum > 0, "check if the input number > 0");
maxNum = myMaxNum; maxNum = myMaxNum;
count = 0; count = 0;
items = new T[myMaxNum]; items = (T*)malloc(sizeof(T) * myMaxNum);
} }
/* /* copy-constructor */
constructor template<typename T>
>> myMaxNum - maximum number of items to keep TensorListBase<T>::TensorListBase(const TensorListBase<T>& l)
>> myMem - the memory pool used for data allocation
>> isIntListOrNot - specify if the list keeps int items
*/
template <typename T>
TensorListBase<T>::TensorListBase(int myMaxNum, XMem* myMem)
{ {
mem = myMem; maxNum = l.maxNum;
maxNum = myMaxNum; count = l.count;
count = 0; items = (T*)malloc(sizeof(T) * maxNum);
items = (T*)mem->Alloc(mem->devID, sizeof(T) * maxNum); memcpy(items, l.items, l.count * sizeof(T));
}
/* move-constructor */
template<typename T>
TensorListBase<T>::TensorListBase(TensorListBase<T>&& l)
{
maxNum = l.maxNum;
count = l.count;

/* steal the buffer rather than copying it; the moved-from list is left empty */
items = l.items;
l.items = NULL;
l.maxNum = 0;
l.count = 0;
}
/* assignment operator for a constant reference */
template<typename T>
TensorListBase<T> TensorListBase<T>::operator=(const TensorListBase<T>& l)
{
if (this != &l) {
maxNum = l.maxNum;
count = l.count;

/* free the old buffer to avoid leaking it */
if (items != NULL)
free(items);
items = (T*)malloc(sizeof(T) * maxNum);
memcpy(items, l.items, l.count * sizeof(T));
}
return *this;
}

/* assignment operator for a rvalue */
template<typename T>
TensorListBase<T> TensorListBase<T>::operator=(TensorListBase<T>&& l)
{
if (this != &l) {
/* free the old buffer and steal the incoming one */
if (items != NULL)
free(items);
maxNum = l.maxNum;
count = l.count;
items = l.items;
l.items = NULL;
l.maxNum = 0;
l.count = 0;
}
return *this;
}
/* de-constructor */ /* de-constructor */
template <typename T> template <typename T>
TensorListBase<T>::~TensorListBase() TensorListBase<T>::~TensorListBase()
{ {
if(items) if(items != NULL)
delete[] items; free(items);
items = NULL; items = NULL;
} }
...@@ -90,13 +114,19 @@ void TensorListBase<T>::Add(T&& item) ...@@ -90,13 +114,19 @@ void TensorListBase<T>::Add(T&& item)
if (count == maxNum) { if (count == maxNum) {
T* newItems; T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1]; newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
else if (newItems != NULL)
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1)); items = newItems;
memcpy(newItems, items, sizeof(T) * maxNum); else {
items = newItems; newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
maxNum = maxNum * 2 + 1; memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
maxNum = count * 2 + 1;
} }
items[count++] = item; items[count++] = item;
} }
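Because maxNum becomes count * 2 + 1 on each growth, the capacity of a default-constructed list moves through 1, 3, 7, 15, ... (i.e. 2^k - 1), so n appends cost O(n) amortized. For instance:

    IntList l;                  /* default capacity: maxNum = 1 */
    for (int i = 0; i < 10; i++)
        l.Add(i);               /* capacity grows on demand: 1 -> 3 -> 7 -> 15 */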
...@@ -117,13 +147,18 @@ void TensorListBase<T>::Add(const T& item) ...@@ -117,13 +147,18 @@ void TensorListBase<T>::Add(const T& item)
{ {
if (count == maxNum) { if (count == maxNum) {
T* newItems; T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1]; newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
else if (newItems != NULL)
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1)); items = newItems;
memcpy(newItems, items, sizeof(T) * maxNum); else {
items = newItems; newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
maxNum = maxNum * 2 + 1; memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
maxNum = count * 2 + 1;
} }
items[count++] = item; items[count++] = item;
...@@ -138,15 +173,19 @@ template <typename T> ...@@ -138,15 +173,19 @@ template <typename T>
void TensorListBase<T>::Add(const T* inputItems, int inputItemCount) void TensorListBase<T>::Add(const T* inputItems, int inputItemCount)
{ {
if (count + inputItemCount >= maxNum) { if (count + inputItemCount >= maxNum) {
int newMaxNum = (count + inputItemCount) * 2 + 1;
T* newItems; T* newItems;
if (mem == NULL)
newItems = new T[newMaxNum]; newItems = (T*)realloc(items, sizeof(T) * (count + inputItemCount + 1));
else if (newItems != NULL)
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * newMaxNum); items = newItems;
memcpy(newItems, items, sizeof(T) * maxNum); else {
items = newItems; newItems = (T*)malloc(sizeof(T) * (maxNum + count + inputItemCount + 1));
maxNum = newMaxNum; memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
maxNum += (count + inputItemCount + 1);
} }
memcpy(items + count, inputItems, sizeof(T) * inputItemCount); memcpy(items + count, inputItems, sizeof(T) * inputItemCount);
count += inputItemCount; count += inputItemCount;
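The bulk Add() has a capacity-bookkeeping mismatch: a successful realloc provides count + inputItemCount + 1 elements, but maxNum is then increased by that amount on top of its old value, so maxNum ends up larger than the actual allocation and a later append can write past the end of the buffer. A corrected sketch that keeps the realloc-first strategy and uses one size for both paths:

template <typename T>
void TensorListBase<T>::Add(const T* inputItems, int inputItemCount)
{
    if (count + inputItemCount >= maxNum) {
        int newMaxNum = (count + inputItemCount) * 2 + 1;  /* one size for both paths */
        T* newItems = (T*)realloc(items, sizeof(T) * newMaxNum);
        if (newItems == NULL) {
            /* realloc failed: the old buffer is still valid, copy it by hand */
            newItems = (T*)malloc(sizeof(T) * newMaxNum);
            memcpy(newItems, items, count * sizeof(T));
            free(items);
        }
        items = newItems;
        maxNum = newMaxNum;                                /* matches the allocation */
    }
    memcpy(items + count, inputItems, sizeof(T) * inputItemCount);
    count += inputItemCount;
}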
...@@ -172,13 +211,18 @@ void TensorListBase<T>::Insert(int pos, const T& item) ...@@ -172,13 +211,18 @@ void TensorListBase<T>::Insert(int pos, const T& item)
{ {
if (count == maxNum) { if (count == maxNum) {
T* newItems; T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1]; newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
else if (newItems != NULL)
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1)); items = newItems;
memcpy(newItems, items, sizeof(T) * maxNum); else {
items = newItems; newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
maxNum = maxNum * 2 + 1; memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
maxNum = count * 2 + 1;
} }
for (int i = count - 1; i >= pos; i--) for (int i = count - 1; i >= pos; i--)
...@@ -192,13 +236,18 @@ void TensorListBase<T>::Insert(int pos, T&& item) ...@@ -192,13 +236,18 @@ void TensorListBase<T>::Insert(int pos, T&& item)
{ {
if (count == maxNum) { if (count == maxNum) {
T* newItems; T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1]; newItems = (T*)realloc(items, sizeof(T) * (count * 2 + 1));
else if (newItems != NULL)
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1)); items = newItems;
memcpy(newItems, items, sizeof(T) * maxNum); else {
items = newItems; newItems = (T*)malloc(sizeof(T) * (count * 2 + 1));
maxNum = maxNum * 2 + 1; memcpy(newItems, items, count * sizeof(T));
free(items);
items = newItems;
}
maxNum = count * 2 + 1;
} }
for (int i = count - 1; i >= pos; i--) for (int i = count - 1; i >= pos; i--)
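The same grow-on-full block now appears four times (both Add overloads and both Insert overloads). A sketch of hoisting it into one private helper — EnsureCapacity is a hypothetical name, not in the commit:

template <typename T>
void TensorListBase<T>::EnsureCapacity(int needed)
{
    if (needed <= maxNum)
        return;
    int newMaxNum = needed * 2 + 1;
    T* newItems = (T*)realloc(items, sizeof(T) * newMaxNum);
    if (newItems == NULL) {
        newItems = (T*)malloc(sizeof(T) * newMaxNum);
        memcpy(newItems, items, count * sizeof(T));
        free(items);
    }
    items = newItems;
    maxNum = newMaxNum;
}

template <typename T>
void TensorListBase<T>::Insert(int pos, const T& item)
{
    EnsureCapacity(count + 1);
    for (int i = count - 1; i >= pos; i--)
        items[i + 1] = items[i];
    items[pos] = item;
    count++;
}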
...@@ -255,8 +304,9 @@ template <typename T> ...@@ -255,8 +304,9 @@ template <typename T>
void TensorListBase<T>::Clear() void TensorListBase<T>::Clear()
{ {
count = 0; count = 0;
delete[] items;
maxNum = 0; maxNum = 0;
if(items != NULL)
free(items);
items = NULL; items = NULL;
} }
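After Clear() the list holds no buffer at all (items == NULL, maxNum == 0), and the next append still works: count == maxNum triggers the growth path, and the C standard defines realloc(NULL, n) to behave like malloc(n). A tiny fragment illustrating the sequence:

XList list(8);
int x = 42;
list.Add((void*)&x);   /* normal append */
list.Clear();          /* frees the buffer, items becomes NULL */
list.Add((void*)&x);   /* still fine: realloc(NULL, ...) acts as malloc */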
...@@ -311,22 +361,7 @@ void TensorListBase<T>::Reserve(int n) ...@@ -311,22 +361,7 @@ void TensorListBase<T>::Reserve(int n)
return; return;
} }
items = new T[n]; items = (T*)malloc(sizeof(T) * n);
}
/*
copy the list
>> myMem - memory pool used for allocating the data in the new list
<< hard copy of the list
*/
template <typename T>
TensorListBase<T>* TensorListBase<T>::Copy(XMem* myMem)
{
TensorListBase<T>* newList = new TensorListBase<T>(maxNum, myMem);
for (int i = 0; i < count; i++) {
newList->Add(GetItem(i));
}
return newList;
} }
/* /*
...@@ -360,20 +395,6 @@ void TensorListBase<T>::Shuffle(int nround, int beg, int len) ...@@ -360,20 +395,6 @@ void TensorListBase<T>::Shuffle(int nround, int beg, int len)
} }
} }
/* sum a range of values */
template<>
int TensorListBase<int>::SumRange(int begin, int end)
{
CheckNTErrors(begin < count && end <= count && end > begin, "invalid index of begin or end");
int value = items[begin];
int i = begin + 1;
while (i < end) {
value += items[i];
i++;
}
return value;
}
/* specializations and typedef of list */ /* specializations and typedef of list */
template struct TensorListBase<int>; template struct TensorListBase<int>;
template struct TensorListBase<char>; template struct TensorListBase<char>;
...@@ -384,5 +405,7 @@ template struct TensorListBase<short>; ...@@ -384,5 +405,7 @@ template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>; template struct TensorListBase<XTensor*>;
template struct TensorListBase<uint64_t>; template struct TensorListBase<uint64_t>;
template struct TensorListBase<void*>; template struct TensorListBase<void*>;
template struct TensorListBase<Example*>;
template struct TensorListBase<Result*>;
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include "XGlobal.h" #include "XGlobal.h"
#include <cstdint> #include <cstdint>
#include <string>
#ifndef __TensorList_H__ #ifndef __TensorList_H__
#define __TensorList_H__ #define __TensorList_H__
...@@ -49,9 +50,6 @@ public: ...@@ -49,9 +50,6 @@ public:
/* maximum number of items can be kept */ /* maximum number of items can be kept */
int maxNum; int maxNum;
/* the memory pool for data array allocation */
XMem* mem;
public: public:
/* constructor */ /* constructor */
TensorListBase(); TensorListBase();
...@@ -59,8 +57,17 @@ public: ...@@ -59,8 +57,17 @@ public:
/* constructor */ /* constructor */
TensorListBase(int myMaxNum); TensorListBase(int myMaxNum);
/* constructor */ /* copy-constructor */
TensorListBase(int myMaxNum, XMem* myMem); TensorListBase(const TensorListBase<T>& l);
/* move-constructor */
TensorListBase(TensorListBase<T>&& l);
/* assignment operator for a constant reference */
TensorListBase<T> operator=(const TensorListBase<T>& l);
/* assignment operator for an rvalue */
TensorListBase<T> operator=(TensorListBase<T>&& l);
/* de-constructor */ /* de-constructor */
~TensorListBase(); ~TensorListBase();
...@@ -113,29 +120,18 @@ public: ...@@ -113,29 +120,18 @@ public:
/* reserve space for data entry */ /* reserve space for data entry */
void Reserve(int n); void Reserve(int n);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */ /* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0); void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */ /* short */
T& operator[] (int i) { T& operator[] (int i) const { return GetItem(i); };
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!"); T& Get(int i) const { return GetItem(i); };
CheckNTErrors(count > 0, "Cannot index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); }; void Set(int i, T item) { SetItem(i, item); };
/* sum a range of values */
T SumRange(int begin, int end);
}; };
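The rewritten accessors delegate to GetItem() and drop the bounds checks and negative-index support that the old inline operator[] provided. If that behavior is still wanted, a sketch that keeps the new const signature and reuses the library's CheckNTErrors macro:

T& operator[] (int i) const {
    CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
    return items[i < 0 ? count + i : i];
};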
struct XTensor; struct XTensor;
struct Example;
struct Result;
typedef TensorListBase<void*> XList; typedef TensorListBase<void*> XList;
typedef TensorListBase<int> IntList; typedef TensorListBase<int> IntList;
...@@ -146,6 +142,8 @@ typedef TensorListBase<float> FloatList; ...@@ -146,6 +142,8 @@ typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList; typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List; typedef TensorListBase<uint64_t> UInt64List;
typedef TensorListBase<XTensor*> TensorList; typedef TensorListBase<XTensor*> TensorList;
typedef TensorListBase<Example*> InputBufferType;
typedef TensorListBase<Result*> OutputBufferType;
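A small usage fragment for the new buffer typedefs (the function and variable names are hypothetical): both lists store raw pointers, so Example and Result only need the forward declarations above, and ownership of the pointed-to objects stays with the caller.

void BufferPair(Example* ex, Result* r)
{
    InputBufferType inputs;           /* TensorListBase<Example*> */
    OutputBufferType outputs;         /* TensorListBase<Result*>  */
    inputs.Add(ex);
    outputs.Add(r);
    Example* first = inputs.Get(0);   /* Get/operator[] return the stored pointer */
    (void)first;
}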
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
......
...@@ -293,7 +293,7 @@ public: ...@@ -293,7 +293,7 @@ public:
void SetComputationMode(bool myIsForComputation); void SetComputationMode(bool myIsForComputation);
/* initialize the index */ /* initialize the index */
void SetIndex(INT_64 size, MTYPE minSizeFirst = 256, int minSizeNum = 20); void SetIndex(INT_64 indexSize, MTYPE minSizeFirst = 256, int minSizeNum = 20);
/* get device id */ /* get device id */
int GetDevID(); int GetDevID();
......
...@@ -536,7 +536,6 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step) ...@@ -536,7 +536,6 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
else { else {
ShowNTErrors("TODO! Unsupported datatype!") ShowNTErrors("TODO! Unsupported datatype!")
} }
} }
/* /*
......