Commit 00d7b386 by liyinqiao

Update the Transformer sample based on the NiuTrans.NMT.

parent 3b93be69
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Munich 18@@ 56 : Four maps that will change your view of the city
A mental asylum , where today young people are said to meet .
A cryp@@ t chap@@ el , where they are now dig@@ ging t@@ unn@@ els for the S @@@ -@@ @ Bahn .
Al@@ lo@@ t@@ ment holders cul@@ tiv@@ ate the soil of former farmers .
The oldest official map of Munich brings cap@@ tiv@@ ating stories to light .
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -26,7 +26,7 @@
#include "./tensor/core/CHeader.h"
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
#include "./sample/transformer/NMT.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -34,7 +34,7 @@
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace nmt;
int main( int argc, const char ** argv )
{
......@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
NMTMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TDecoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
initialize the model
>> config - configurations of the model
*/
void AttDecoder::InitModel(T2TConfig& config)
void AttDecoder::InitModel(Config& config)
{
devID = config.devID;
nlayer = config.nDecLayer;
......@@ -80,16 +77,17 @@ void AttDecoder::InitModel(T2TConfig& config)
/* embedding model */
embedder.InitModel(config, false);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
selfAttLayerNorms = new LN[nlayer];
enDeAtt = new Attention[nlayer];
enDeAttLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
if (preNorm)
decoderLayerNorm = new T2TLN;
decoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -99,6 +97,8 @@ void AttDecoder::InitModel(T2TConfig& config)
fnnLayerNorms[i].InitModel(config);
enDeAtt[i].InitModel(config);
enDeAttLayerNorms[i].InitModel(config);
selfAttCache[i].enable = true;
enDeAttCache[i].enable = true;
}
if (preNorm)
decoderLayerNorm->InitModel(config);
......@@ -115,9 +115,10 @@ make the decoding network
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining)
XTensor* maskEncDec, int nstep, bool isTraining)
{
XTensor x;
x = embedder.Make(inputDec, true, isTraining, nstep);
/* dropout */
......@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
}
if (preNorm)
x = decoderLayerNorm->Make(x);
return decoderLayerNorm->Make(x);
return x;
}
/*
make the decoding network (pre-norm variant: layer normalization is applied
BEFORE each sub-layer, with a final normalization on the decoder output;
compare with Make(), which supports both pre-norm and post-norm)
NOTE(review): this function unconditionally dereferences decoderLayerNorm,
which InitModel only allocates when preNorm is true — callers must ensure
the model was built with pre-norm enabled; TODO confirm.
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - mask that indicates which position is valid
>> maskEncDec - mask for the encoder-decoder attention
>> nstep - the current length of the decoder input
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                             XTensor* maskEncDec, int nstep, bool isTraining)
{
    XTensor x;

    /* embed the decoder input; nstep presumably offsets positions for
       incremental (step-by-step) decoding — TODO confirm against Embedder::Make */
    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout (training only) */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    /* each iteration is one decoder layer:
       self-attn -> enc-dec attn -> fnn, each wrapped as
       x = res + Sublayer(LN(x)) */
    for (int i = 0; i < nlayer; i++) {
        XTensor res;

        /* keep the input for the residual connection */
        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = selfAttLayerNorms[i].Make(x);

        /******************/
        /* self attention (uses per-layer cache, enabled in InitModel) */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for encoder-decoder attention */
        x = enDeAttLayerNorms[i].Make(x);

        /* encoder-decoder attention: keys/values come from the encoder
           output, the query from the decoder state */
        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
                            isTraining, &enDeAttCache[i], EN_DE_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final normalization of the stack output (pre-norm networks need this) */
    x = decoderLayerNorm->Make(x);

    return x;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#ifndef __DECODER_H__
#define __DECODER_H__
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "Encoder.h"
#include "Utility.h"
namespace transformer
namespace nmt
{
class AttDecoder
......@@ -52,28 +51,28 @@ public:
DTYPE dropoutP;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalization for attention */
T2TLN* selfAttLayerNorms;
LN* selfAttLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for decoder */
T2TLN* decoderLayerNorm;
LN* decoderLayerNorm;
/* encoder-decoder attention model of each layer */
T2TAttention* enDeAtt;
Attention* enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN* enDeAttLayerNorms;
LN* enDeAttLayerNorms;
/* layer cache list */
Cache* selfAttCache;
......@@ -92,11 +91,15 @@ public:
~AttDecoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the decoding network */
XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
/* make the decoding network (pre norm) */
XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
XTensor* maskEncDec, int nstep, bool isTraining);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TEncoder.h"
#include "module/T2TUtility.h"
#include "module/T2TLayerNormal.h"
#include "module/T2TCommonModules.h"
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
initialize the model
>> config - configurations for the model
*/
void AttEncoder::InitModel(T2TConfig& config)
void AttEncoder::InitModel(Config& config)
{
devID = config.devID;
......@@ -68,18 +65,18 @@ void AttEncoder::InitModel(T2TConfig& config)
dropoutP = config.dropout;
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(config);
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
fnnLayerNorms = new T2TLN[nlayer];
selfAtt = new Attention[nlayer];
fnns = new FNN[nlayer];
attLayerNorms = new LN[nlayer];
fnnLayerNorms = new LN[nlayer];
if (preNorm)
encoderLayerNorm = new T2TLN;
encoderLayerNorm = new LN;
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
......@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);
/* self attention */
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);
/* dropout */
if (isTraining && dropoutP > 0)
......@@ -151,7 +148,63 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
}
if (preNorm)
x = encoderLayerNorm->Make(x);
return encoderLayerNorm->Make(x);
return x;
}
/*
make the encoding network (pre-norm variant: layer normalization is applied
BEFORE each sub-layer, with a final normalization on the encoder output;
compare with Make(), which supports both pre-norm and post-norm)
NOTE(review): this function unconditionally dereferences encoderLayerNorm,
which InitModel only allocates when preNorm is true — callers must ensure
the model was built with pre-norm enabled; TODO confirm.
>> input - the input tensor of the encoder
>> mask - the mask that indicate each position is valid
>> maskEncDec - no use
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    XTensor x;

    /* word + position embedding of the source input */
    x = embedder.Make(input, false, isTraining);

    /* dropout (training only) */
    if (isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);

    /* each iteration is one encoder layer:
       self-attn -> fnn, each wrapped as x = res + Sublayer(LN(x)) */
    for (int i = 0; i < nlayer; i++) {
        XTensor res;

        /* keep the input for the residual connection */
        res = x;

        /* layer normalization with pre-norm for self-attn */
        x = attLayerNorms[i].Make(x);

        /* self attention (no cache on the encoder side, hence NULL) */
        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);

        res = x;

        /* layer normalization with pre-norm for fnn */
        x = fnnLayerNorms[i].Make(x);

        /* fnn */
        x = fnns[i].Make(x, isTraining);

        /* dropout */
        if (isTraining && dropoutP > 0)
            x = Dropout(x, dropoutP);

        /* residual connection */
        x = Sum(res, x);
    }

    /* final normalization of the stack output (pre-norm networks need this) */
    x = encoderLayerNorm->Make(x);

    return x;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,25 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TENCODER_H__
#define __T2TENCODER_H__
#ifndef __ENCODER_H__
#define __ENCODER_H__
#include "module/T2TFNN.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "module/T2TEmbedding.h"
#include "module/T2TLayerNormal.h"
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
base class of the encoder
*/
class T2TEncoder
class Encoder
{
public:
virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
......@@ -47,7 +46,7 @@ public:
/*
the encoder based on self-attention
*/
class AttEncoder : T2TEncoder
class AttEncoder : Encoder
{
public:
/* device id */
......@@ -73,22 +72,22 @@ public:
int ignored;
/* embedding of word at each position */
T2TEmbedder embedder;
Embedder embedder;
/* FNN model of each layer */
T2TFNN* fnns;
FNN* fnns;
/* attention model of each layer */
T2TAttention* selfAtt;
Attention* selfAtt;
/* layer normalizations for attention */
T2TLN* attLayerNorms;
LN* attLayerNorms;
/* layer normalization for fnn */
T2TLN* fnnLayerNorms;
LN* fnnLayerNorms;
/* layer normalization for encoder */
T2TLN* encoderLayerNorm;
LN* encoderLayerNorm;
/* the location of layer normalization */
bool preNorm;
......@@ -101,11 +100,14 @@ public:
~AttEncoder();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the encoding network */
XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network */
XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
/* make the encoding network (wrapper) */
XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
};
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,32 +21,32 @@
#include <cstdint>
#include "T2TModel.h"
#include "module/T2TUtility.h"
#include "Model.h"
#include "Utility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TModel::T2TModel()
Model::Model()
{
devID = -1;
isLM = false;
isMT = false;
useFP16 = false;
shareAllEmbeddings = false;
shareDecInputOutputWeight = false;
shareAllEmbeddings = 0;
shareDecInputOutputWeight = 0;
nhead = 1;
encoder = new AttEncoder();
decoder = new AttDecoder();
outputLayer = new T2TOutput();
outputLayer = new Output();
}
/* de-constructor */
T2TModel::~T2TModel()
Model::~Model()
{
delete encoder;
delete decoder;
......@@ -58,7 +57,7 @@ T2TModel::~T2TModel()
initialize the model
>> config - configurations of the model
*/
void T2TModel::InitModel(T2TConfig& config)
void Model::InitModel(Config& config)
{
devID = config.devID;
isMT = config.isMT;
......@@ -71,8 +70,8 @@ void T2TModel::InitModel(T2TConfig& config)
&config.fnnHiddenSize, &config.modelSize,
&config.embSize, &config.srcVocabSize,
&config.tgtVocabSize, &config.nhead,
&config.maxRP, &shareAllEmbeddings,
&shareDecInputOutputWeight,
&config.maxRP, &config.shareAllEmbeddings,
&config.shareDecInputOutputWeight,
&config.maxPosLen
};
......@@ -81,10 +80,28 @@ void T2TModel::InitModel(T2TConfig& config)
/* read model configurations */
if (!config.isTraining) {
modelFile = fopen(config.modelFN, "rb");
for (auto& meta : metaInfo)
CheckNTErrors(modelFile, "Failed to open the model file");
for (auto& meta : metaInfo) {
fread(meta, sizeof(int), 1, modelFile);
}
}
else {
/* read the source and target vocab size */
FILE* trainF = fopen(config.trainFN, "rb");
CheckNTErrors(trainF, "Failed to open the training file");
fread(&config.srcVocabSize, sizeof(config.srcVocabSize), 1, trainF);
fread(&config.tgtVocabSize, sizeof(config.tgtVocabSize), 1, trainF);
CheckNTErrors(config.srcVocabSize > 0, "Invalid source vocabulary size");
CheckNTErrors(config.tgtVocabSize > 0, "Invalid target vocabulary size");
fclose(trainF);
}
nhead = config.nhead;
shareAllEmbeddings = config.shareAllEmbeddings;
shareDecInputOutputWeight = config.shareDecInputOutputWeight;
ShowModelConfig(config);
encoder->InitModel(config);
outputLayer->InitModel(config);
......@@ -92,13 +109,12 @@ void T2TModel::InitModel(T2TConfig& config)
if (isMT)
decoder->InitModel(config);
TensorList params(10);
GetParams(params);
/* load parameters */
if (!config.isTraining)
Read(modelFile);
else {
TensorList params;
GetParams(params);
for (int i = 0; i < params.Size(); i++)
params[i]->SetVarFlag();
}
......@@ -108,13 +124,28 @@ void T2TModel::InitModel(T2TConfig& config)
}
/*
print model configurations to stderr (one line per setting)
NOTE(review): config could be passed as const& — it is only read here;
left as-is to keep the signature consistent with the declaration elsewhere.
>> config - model configurations
*/
void Model::ShowModelConfig(Config& config)
{
    /* TODO: output more info */
    XPRINT1(0, stderr, "encoder layer: %d\n", config.nEncLayer);
    XPRINT1(0, stderr, "decoder layer: %d\n", config.nDecLayer);
    XPRINT1(0, stderr, "attention heads: %d\n", config.nhead);
    XPRINT1(0, stderr, "model size: %d\n", config.modelSize);
    XPRINT1(0, stderr, "source vocab size: %d\n", config.srcVocabSize);
    XPRINT1(0, stderr, "target vocab size: %d\n", config.tgtVocabSize);
}
/*
make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> input - input tensor, (batchSize, srcLen)
>> mask - the mask for encoder self-attention, (headNum, batchSize, srcLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result
<< return - encoding result, (batchSize, srcLen, hiddenDim)
*/
XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
XTensor Model::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
......@@ -123,18 +154,17 @@ XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
/*
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> output - output tensor (distribution)
>> mask - mask for positions that are/not involved in computation
>> maskEncDec - mask for the encoder-decoder attention
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> outputEnc - output tensor of the encoder, (batchSize, srcLen, hiddenDim)
>> mask - mask for decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maskEncDec - mask for the encoder-decoder attention, (headNum, batchSize, tgtLen, srcLen)
>> isTraining - indicates whether we are training the model
<< return - encoding result
<< return - decoding result, (batchSize, tgtLen, hiddenDim)
*/
XTensor T2TModel::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
XTensor Model::MakeDecoder(XTensor& inputDec, XTensor& outputEnc,
XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
return decoder->Make(inputDec, outputEnc, mask, &maskEncDec,
inputDec.GetDim(1), isTraining);
}
......@@ -145,7 +175,7 @@ make the network for language modeling (with the output softmax layer)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
void Model::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
int len = padding.GetDim(padding.order - 1);
int* dims = new int[padding.order + 2];
......@@ -173,19 +203,19 @@ void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool is
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> paddingEnc - padding of the sequences (on the encoder side)
>> paddingDec - padding of the sequences (on the decoder side)
>> inputEnc - input tensor of the encoder, (batchSize, srcLen)
>> inputDec - input tensor of the decoder, (batchSize, tgtLen)
>> output - output tensor (distribution), (batchSize, tgtLen, hiddenDim)
>> paddingEnc - padding of the sequences (on the encoder side), (batchSize, srcLen)
>> paddingDec - padding of the sequences (on the decoder side), (batchSize, tgtLen)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec,
bool isTraining)
void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
......@@ -213,9 +243,9 @@ make the mask for training MT models
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int* dims = new int[inputDec.order + 2];
......@@ -235,8 +265,8 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
......@@ -260,8 +290,7 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -284,38 +313,28 @@ void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> maskEnc - mask of the encoder self-attention, (headNum, batchSize, srcLen, srcLen)
*/
void T2TModel::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
{
XTensor padding2;
XTensor padding3;
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, padding3, 0, nhead);
ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, &padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
SumMe(maskEnc, padding3);
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
/*
make the mask of the decoder
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
>> paddingEnc - padding of the encoder input, (batchSize, srcLen)
>> paddingDec - padding of the decoder input, (batchSize, tgtLen)
>> maksDec - mask of the decoder self-attention, (headNum, batchSize, tgtLen, tgtLen)
>> maksEncDec - mask of the decoder enc-dec attention, (headNum, batchSize, tgtLen, srcLen)
*/
void T2TModel::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec)
{
int len = paddingDec.GetDim(paddingDec.order - 1);
int* dims = new int[paddingDec.order + 2];
......@@ -340,26 +359,27 @@ void T2TModel::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
delete[] dims;
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrics
*/
void T2TModel::GetParams(TensorList& list)
void Model::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->selfAtt[i].wq);
list.Add(&encoder->selfAtt[i].wk);
list.Add(&encoder->selfAtt[i].wv);
list.Add(&encoder->selfAtt[i].bq);
list.Add(&encoder->selfAtt[i].bk);
list.Add(&encoder->selfAtt[i].bv);
list.Add(&encoder->selfAtt[i].weightQ);
list.Add(&encoder->selfAtt[i].weightK);
list.Add(&encoder->selfAtt[i].weightV);
list.Add(&encoder->selfAtt[i].biasQ);
list.Add(&encoder->selfAtt[i].biasK);
list.Add(&encoder->selfAtt[i].biasV);
if (encoder->selfAtt[i].useRPR)
list.Add(&encoder->selfAtt[i].RPEmbK);
list.Add(&encoder->selfAtt[i].wo);
list.Add(&encoder->selfAtt[i].bo);
list.Add(&encoder->selfAtt[i].weightO);
list.Add(&encoder->selfAtt[i].biasO);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
......@@ -377,26 +397,26 @@ void T2TModel::GetParams(TensorList& list)
if (isMT) {
/* decoder parameters */
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].wq);
list.Add(&decoder->selfAtt[i].wk);
list.Add(&decoder->selfAtt[i].wv);
list.Add(&decoder->selfAtt[i].bq);
list.Add(&decoder->selfAtt[i].bk);
list.Add(&decoder->selfAtt[i].bv);
list.Add(&decoder->selfAtt[i].weightQ);
list.Add(&decoder->selfAtt[i].weightK);
list.Add(&decoder->selfAtt[i].weightV);
list.Add(&decoder->selfAtt[i].biasQ);
list.Add(&decoder->selfAtt[i].biasK);
list.Add(&decoder->selfAtt[i].biasV);
if (decoder->selfAtt[i].useRPR)
list.Add(&decoder->selfAtt[i].RPEmbK);
list.Add(&decoder->selfAtt[i].wo);
list.Add(&decoder->selfAtt[i].bo);
list.Add(&decoder->selfAtt[i].weightO);
list.Add(&decoder->selfAtt[i].biasO);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].wq);
list.Add(&decoder->enDeAtt[i].wk);
list.Add(&decoder->enDeAtt[i].wv);
list.Add(&decoder->enDeAtt[i].bq);
list.Add(&decoder->enDeAtt[i].bk);
list.Add(&decoder->enDeAtt[i].bv);
list.Add(&decoder->enDeAtt[i].wo);
list.Add(&decoder->enDeAtt[i].bo);
list.Add(&decoder->enDeAtt[i].weightQ);
list.Add(&decoder->enDeAtt[i].weightK);
list.Add(&decoder->enDeAtt[i].weightV);
list.Add(&decoder->enDeAtt[i].biasQ);
list.Add(&decoder->enDeAtt[i].biasK);
list.Add(&decoder->enDeAtt[i].biasV);
list.Add(&decoder->enDeAtt[i].weightO);
list.Add(&decoder->enDeAtt[i].biasO);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
......@@ -418,8 +438,9 @@ void T2TModel::GetParams(TensorList& list)
list.Add(&decoder->embedder.w);
}
if (shareDecInputOutputWeight == 0)
if (shareDecInputOutputWeight == 0) {
list.Add(&outputLayer->w);
}
}
/*
......@@ -427,14 +448,14 @@ dump the model to a file
>> fn - where to save the model
>> model - the model
*/
void T2TModel::Dump(const char* fn)
void Model::Dump(const char* fn)
{
double startT = GetClockSec();
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
TensorList params;
GetParams(params);
......@@ -459,22 +480,29 @@ void T2TModel::Dump(const char* fn)
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model saved (took %.1fs)\n", elapsed);
LOG("model saved (took %.1fs)", elapsed);
}
/* read the parameters */
void T2TModel::Read(FILE* file)
void Model::Read(FILE* file)
{
double startT = GetClockSec();
TensorList params(100);
TensorList params;
GetParams(params);
LOG("params count: %lu", params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
}
LOG("params size: %d", size);
/* convert parameters to FP16 */
/* convert parameters to FP16 before reading files */
if (useFP16) {
LOG("Convert parameters to FP16");
for (int i = 0; i < params.Size(); i++) {
XTensor* p = params[i];
InitTensorV2(p, p->order, p->dimSize, X_FLOAT16, 1, p->devID);
InitTensor(p, p->order, p->dimSize, X_FLOAT16, p->devID, p->enableGrad && X_ENABLE_GRAD);
}
auto& encEmb = encoder->embedder.posEmbeddingBase;
......@@ -488,18 +516,18 @@ void T2TModel::Read(FILE* file)
/* share all embeddings */
if (shareAllEmbeddings == 1) {
decoder->embedder.w = CopyValues(encoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing encoder decoder embeddings\n");
_CopyValues(&encoder->embedder.w, &decoder->embedder.w);
LOG("sharing encoder decoder embeddings");
}
/* share embeddings with output weights */
if (shareDecInputOutputWeight == 1) {
outputLayer->w = CopyValues(decoder->embedder.w);
XPRINT(0, stderr, "[INFO] sharing decoder embeddings with output weights\n");
_CopyValues(&decoder->embedder.w, &outputLayer->w);
LOG("sharing decoder embeddings with output weights");
}
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
LOG("model loaded (took %.1fs)", elapsed);
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,23 +19,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TMODEL_H__
#define __T2TMODEL_H__
#ifndef __MODEL_H__
#define __MODEL_H__
#include "T2TEncoder.h"
#include "T2TDecoder.h"
#include "module/T2TFNN.h"
#include "module/T2TOutput.h"
#include "module/T2TUtility.h"
#include "module/T2TAttention.h"
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "Utility.h"
#include "module/Attention.h"
namespace transformer
namespace nmt
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
/* a nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
{
public:
/* device id */
......@@ -49,7 +47,7 @@ public:
AttDecoder* decoder;
/* output layer */
T2TOutput* outputLayer;
Output* outputLayer;
/* indicates whether the model is running for language modeling */
bool isLM;
......@@ -71,13 +69,16 @@ public:
public:
/* constructor */
T2TModel();
Model();
/* de-constructor */
~T2TModel();
~Model();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* print model configurations */
void ShowModelConfig(Config& config);
/* make the encoding network */
XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,49 +16,47 @@
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
*/
#include <cmath>
#include <ctime>
#include "Transformer.h"
#include "train/T2TTrainer.h"
#include "module/T2TUtility.h"
#include "translate/T2TTranslator.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
#include "NMT.h"
#include "train/Trainer.h"
#include "translate/Translator.h"
namespace transformer
namespace nmt
{
int TransformerMain(int argc, const char** argv)
int NMTMain(int argc, const char** argv)
{
if (argc == 0)
return 1;
/* load configurations */
T2TConfig config(argc, argv);
Config config(argc, argv);
srand((unsigned int)time(NULL));
srand(1);
/* train the model */
/* training */
if (strcmp(config.trainFN, "") != 0) {
ENABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTrainer trainer;
Trainer trainer;
trainer.Init(config);
trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
}
/* translate the test file */
/* translating */
if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
/* disable grad flow */
DISABLE_GRAD;
T2TModel model;
Model model;
model.InitModel(config);
T2TTranslator translator;
Translator translator;
translator.Init(config);
translator.Translate(config.testFN, config.srcVocabFN,
config.tgtVocabFN, config.outputFN, &model);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,29 +15,17 @@
*/
/*
*
* An implementation of the transformer system. See more details
* about FNNLM in
* "Attention Is All You Need" by Vaswani et al.
* https://arxiv.org/pdf/1706.03762.pdf
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* I start writing the code related to NMT - a long time since my last coding
* work on MT
* An implementation of the NMT system.
*/
#ifndef __TRANSFORMER_H__
#define __TRANSFORMER_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
#ifndef __NMT_H__
#define __NMT_H__
namespace transformer
namespace nmt
{
/* entrance of the program */
int TransformerMain(int argc, const char** argv);
int NMTMain(int argc, const char** argv);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -27,13 +26,13 @@
#include <fstream>
#include <sstream>
#include "T2TUtility.h"
#include "../../../tensor/XGlobal.h"
#include "Utility.h"
#include "../../tensor/XGlobal.h"
using namespace nts;
using namespace std;
namespace transformer
namespace nmt
{
/*
......@@ -41,7 +40,7 @@ load configurations from the command
>> argc - number of arguments
>> argv - the list of arguments
*/
T2TConfig::T2TConfig(int argc, const char** argv)
Config::Config(int argc, const char** argv)
{
char** args = new char* [MAX_PARAM_NUM];
for (int i = 0; i < argc; i++) {
......@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
ShowParams(argsNum, args);
/* options for the model */
LoadParamInt(argsNum, args, "nhead", &nhead, 8);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
LoadParamInt(argsNum, args, "nhead", &nhead, 4);
LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
LoadParamInt(argsNum, args, "embsize", &embSize, 256);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
LoadParamInt(argsNum, args, "embsize", &embSize, 512);
LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
LoadParamInt(argsNum, args, "padid", &padID, 1);
LoadParamInt(argsNum, args, "startid", &startID, 2);
LoadParamInt(argsNum, args, "endid", &endID, 2);
LoadParamBool(argsNum, args, "rpr", &useRPR, false);
LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
LoadParamString(argsNum, args, "model", modelFN, "model.bin");
LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
// TODO: refactor the parameters type to support weight sharing during training
LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
LoadParamString(argsNum, args, "model", modelFN, "");
LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");
......@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamString(argsNum, args, "train", trainFN, "");
LoadParamString(argsNum, args, "valid", validFN, "");
LoadParamInt(argsNum, args, "dev", &devID, 0);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argc, args, "lrate", &lrate, 1.0F);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
LoadParamInt(argc, args, "nepoch", &nepoch, 20);
LoadParamInt(argc, args, "nepoch", &nepoch, 50);
LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
LoadParamInt(argc, args, "nstep", &nstep, 100000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000);
LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
LoadParamBool(argc, args, "adam", &useAdam, true);
LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
......@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
LoadParamBool(argc, args, "debug", &isDebugged, false);
LoadParamBool(argc, args, "sorted", &isLenSorted, false);
LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
......@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
LoadParamInt(argc, args, "bucketsize", &bucketSize, 0);
LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);
/* options for translating */
LoadParamString(argsNum, args, "test", testFN, "");
......@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
for (int i = 0; i < argc; i++)
delete[] args[i];
......@@ -136,7 +139,7 @@ load configurations from a file
>> args - the list to store the configurations
format: one option per line, separated by a blank or a tab
*/
int T2TConfig::LoadFromFile(const char* configFN, char** args) {
int Config::LoadFromFile(const char* configFN, char** args) {
ifstream f(configFN, ios::in);
CheckNTErrors(f.is_open(), "unable to open the config file");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,18 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TUTILITY_H__
#define __T2TUTILITY_H__
#ifndef __UTILITY_H__
#define __UTILITY_H__
#include <string>
#include <cstdio>
#include "../../../tensor/XList.h"
#include "../../tensor/XList.h"
using namespace std;
using namespace nts;
namespace transformer
namespace nmt
{
#define MAX_PARAM_NUM 100
......@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
UInt64List SplitToPos(const string& s, const string& delimiter);
/* configurations for t2t */
class T2TConfig {
/* configurations for */
class Config {
public:
/* path to the model */
char modelFN[1024];
......@@ -131,6 +130,12 @@ public:
/* indicates whether the model is running for machine translation */
bool isMT;
/* indicates whether share encoder decoder embeddings */
int shareAllEmbeddings;
/* indicates whether share decoder embeddings and output weights */
int shareDecInputOutputWeight;
/* indicates whether the model is running with FP16 data type */
bool useFP16;
......@@ -164,9 +169,12 @@ public:
/* training epoch number */
int nepoch;
/* traing step number */
/* training step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use Adam */
bool useAdam;
......@@ -193,9 +201,6 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
......@@ -222,7 +227,7 @@ public:
public:
/* load configurations from the command */
T2TConfig(int argc, const char** argv);
Config(int argc, const char** argv);
/* load configurations from a file */
int LoadFromFile(const char* configFN, char** args);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -15,22 +14,20 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TAttention.h"
#include "T2TEmbedding.h"
#include "Attention.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TAttention::T2TAttention()
Attention::Attention()
{
nhead = -1;
dk = -1;
......@@ -39,7 +36,7 @@ T2TAttention::T2TAttention()
}
/* de-constructor */
T2TAttention::~T2TAttention()
Attention::~Attention()
{
}
......@@ -47,7 +44,7 @@ T2TAttention::~T2TAttention()
initialize the model
>> config - the configurations of the network
*/
void T2TAttention::InitModel(T2TConfig& config)
void Attention::InitModel(Config& config)
{
devID = config.devID;
useRPR = config.useRPR;
......@@ -59,28 +56,34 @@ void T2TAttention::InitModel(T2TConfig& config)
maxRP = config.maxRP;
dropoutP = config.attDropout;
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
/* initialize the parameters */
InitTensor2D(&weightQ, d, d, X_FLOAT, devID);
InitTensor1D(&biasQ, d, X_FLOAT, devID);
InitTensor2D(&weightK, d, d, X_FLOAT, devID);
InitTensor1D(&biasK, d, X_FLOAT, devID);
InitTensor2D(&weightV, d, d, X_FLOAT, devID);
InitTensor1D(&biasV, d, X_FLOAT, devID);
if (useRPR)
InitTensor2D(&RPEmbK, maxRP * 2 + 1, d / nhead, X_FLOAT, devID);
InitTensor2D(&wo, d, d, X_FLOAT, devID);
InitTensor1D(&bo, d, X_FLOAT, devID);
InitTensor2D(&weightO, d, d, X_FLOAT, devID);
InitTensor1D(&biasO, d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&wk, scale);
_SetDataFanInOut(&wq, scale);
_SetDataFanInOut(&wv, scale);
_SetDataFanInOut(&wo, scale);
_SetDataFanInOut(&weightK, scale);
_SetDataFanInOut(&weightQ, scale);
_SetDataFanInOut(&weightV, scale);
_SetDataFanInOut(&weightO, scale);
if (useRPR)
_SetDataFanInOut(&RPEmbK, scale);
bk.SetZeroAll();
bq.SetZeroAll();
bv.SetZeroAll();
bo.SetZeroAll();
biasQ.SetZeroAll();
biasO.SetZeroAll();
biasK.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
biasV.SetDataRand(-(DTYPE)sqrt(6.0F / d), (DTYPE)sqrt(6.0F / d));
}
/*
......@@ -96,30 +99,30 @@ make the network
>> cacheType - type of cache, e.g., self-attention
<< return - multi-attention result
*/
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType)
XTensor Attention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int attType)
{
const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MulAndShift(q, wq, bq);
q2 = MulAndShift(q, weightQ, biasQ);
if (!cache || isTraining) {
if (!cache || isTraining || !(cache->enable)) {
/* self attention for encoder layers */
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
if (useRPR)
if (useRPR && attType == SELF_ATT)
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
else {
if (cacheType == SELF_ATT) {
k2 = MulAndShift(k, wk, bk);
v2 = MulAndShift(v, wv, bv);
if (attType == SELF_ATT) {
k2 = MulAndShift(k, weightK, biasK);
v2 = MulAndShift(v, weightV, biasV);
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
......@@ -134,10 +137,10 @@ XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
return MakeAttention(cache->key, q2, cache->value, mask, isTraining);
}
else if (cacheType == EN_DE_ATT) {
else if (attType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MulAndShift(k, wk, bk);
cache->value = MulAndShift(v, wv, bv);
cache->key = MulAndShift(k, weightK, biasK);
cache->value = MulAndShift(v, weightV, biasV);
cache->miss = false;
}
......@@ -155,8 +158,8 @@ make the attention network given keys, queries and values (after linear transfor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
XTensor Attention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
......@@ -185,7 +188,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (mask)
dot = dot + (*mask);
dot = dot + *mask;
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
......@@ -203,7 +206,7 @@ XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
......@@ -216,16 +219,16 @@ with the given keys, queries and values (after linear transformation)
>> isTraining - indicates whether the model is used for training
>> isEnc - indicates whether it is encoder
*/
XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
const int batchSize = q.dimSize[0];
const int lenQ = q.dimSize[1];
const int lenKV = k.dimSize[1];
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
const auto dataType = k.dataType;
......@@ -241,7 +244,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor embMatrix, relativeKey;
/* generate the relative emb index (L_q, L_kv) */
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc);
embMatrix = GetRPEmbedding(lenQ, lenKV, maxRP, isEnc || isTraining);
/* generate the relative key from the RPEmbK (L_q, L_kv, H/K) */
relativeKey = Gather(RPEmbK, embMatrix);
......@@ -252,12 +255,13 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
ScaleAndShiftMe(qheads, 1.0F / float(nhead));
float scaling = sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
if (mask)
dot = dot + (*mask);
dot = dot + *mask;
/* softmax */
scalar = Softmax(dot, -1);
......@@ -275,7 +279,7 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
att = ConvertDataType(att, dataType);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), wo, bo);
return MulAndShift(Merge(att, att.order - 1), weightO, biasO);
}
/*
......@@ -284,8 +288,8 @@ generate relative position embeddings
>> lenKV - the length of key and value
>> maxRelativeLen - the maximum length of relative position
*/
XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
XTensor Attention::GetRPEmbedding(const int lenQ, const int lenKV,
const int maxRelativeLen, const bool isEnc)
{
XTensor range;
XTensor embMatrix;
......@@ -309,37 +313,46 @@ XTensor T2TAttention::GetRPEmbedding(const int lenQ, const int lenKV,
embMatrix = Unsqueeze(range, 0, lenQ);
}
ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
ScaleAndShiftMe(embMatrix, 1.0F, float(maxRelativeLen));
//ClipMe(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = Clip(embMatrix, -float(maxRelativeLen), float(maxRelativeLen));
embMatrix = ScaleAndShift(embMatrix, 1.0F, float(maxRelativeLen));
delete[] index;
return embMatrix;
}
/*
Relative position-aware dot-product attention inner calculation.
relative position-aware dot-product attention inner calculation.
>> x - Tensor with shape [batch_size*heads, length, length or depth].
>> y - Tensor with shape [batch_size*heads, length, depth].
>> z - Tensor with shape [length, length, depth].
>> isKey - Whether y is key.
<< return - A Tensor with shape [batch_size*heads, length, length or depth].
*/
XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool isKey)
{
const int headNum = nhead;
const int batchSize = x.dimSize[1];
const int lenQ = x.dimSize[2];
const int lenKV = y.dimSize[2];
const int depth = y.dimSize[3];
const int batchSize = x.GetDim(1);
const int lenQ = x.GetDim(2);
const int lenKV = y.GetDim(2);
const int depth = y.GetDim(3);
const int lastDim = isKey ? lenKV : depth;
MATRIX_TRANS_TYPE transposeFlag = isKey ? X_TRANS : X_NOTRANS;
auto transposeFlag = isKey ? X_TRANS : X_NOTRANS;
XTensor context;
context = MatrixMulBatched(x, X_NOTRANS, y, transposeFlag);
int mergeDimsX[] = { headNum * batchSize, lenQ, x.GetDim(3) };
int mergeDimsY[] = { headNum * batchSize, lenKV, y.GetDim(3) };
x = Reshape(x, 3, mergeDimsX);
y = Reshape(y, 3, mergeDimsY);
if (isKey) {
y = Transpose(y, 1, 2);
}
int mergeDims[] = { headNum * batchSize, lenQ, x.dimSize[3] };
x.Reshape(3, mergeDims);
XTensor context;
context = BMMul(x, y);
int newDims[]{ headNum, batchSize, context.GetDim(1), context.GetDim(2) };
context = Reshape(context, 4, newDims);
XTensor xTrans;
xTrans = Transpose(x, 0, 1);
......@@ -351,15 +364,17 @@ XTensor T2TAttention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const boo
relativeTrans = Transpose(relative, 0, 1);
int splitDims[] = { headNum, batchSize, lenQ, lastDim };
relativeTrans.Reshape(4, splitDims);
return Sum(context, relativeTrans);
relativeTrans = Reshape(relativeTrans, 4, splitDims);
return context + relativeTrans;
}
/* constructor */
Cache::Cache()
{
miss = true;
enable = true;
}
/* update the states cache */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TATTENTION_H__
#define __T2TATTENTION_H__
#ifndef __ATTENTION_H__
#define __ATTENTION_H__
#include "T2TNNUtil.h"
#include "T2TUtility.h"
#include "NNUtil.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* attention type */
enum { NONE, SELF_ATT, EN_DE_ATT };
......@@ -50,6 +49,9 @@ public:
/* indicates cache miss if 'true' */
bool miss;
/* indicates whether we use cache */
bool enable;
/* constructor */
Cache();
......@@ -64,7 +66,7 @@ public:
};
/* multi-head attention */
class T2TAttention
class Attention
{
public:
/* device id */
......@@ -74,22 +76,22 @@ public:
int nhead;
/* transformation matrix for Q */
XTensor wq;
XTensor weightQ;
/* bias for Q */
XTensor bq;
XTensor biasQ;
/* transformation matrix for K */
XTensor wk;
XTensor weightK;
/* bias for K */
XTensor bk;
XTensor biasK;
/* transformation matrix for V */
XTensor wv;
XTensor weightV;
/* bias for V */
XTensor bv;
XTensor biasV;
XTensor wBig;
......@@ -99,10 +101,10 @@ public:
XTensor RPEmbK;
/* transformation after dot-product attention */
XTensor wo;
XTensor weightO;
/* bias after dot-product attention */
XTensor bo;
XTensor biasO;
/* size of transformed Q and K */
int dk;
......@@ -124,13 +126,13 @@ public:
public:
/* constructor */
T2TAttention();
Attention();
/* de-constructor */
~T2TAttention();
~Attention();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& k, XTensor& q, XTensor& v,
......@@ -145,8 +147,10 @@ public:
XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor* mask, bool isTraining, bool isEnc);
/* generate relative position embeddings */
XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);
/* relative position-aware dot-product attention inner calculation */
XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,11 @@
* This file includes some common modules of the Transformer model
*/
#include <cmath>
#include "T2TCommonModules.h"
#include "CommonModules.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/*
......@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
>> before - whether we use layernorm before attention/fnn
>> after - whether we use layernorm after attention/fnn
*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
{
if (after ^ prenorm)
return ln.Make(input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northestern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,16 +21,16 @@
#ifndef __COMMONMODULE_H__
#define __COMMONMODULE_H__
#include "T2TLayerNormal.h"
#include "T2TCommonModules.h"
#include "LayerNorm.h"
#include "CommonModules.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* the layer normalization module to control pre-norm or post-norm*/
XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,17 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TEmbedder::T2TEmbedder()
Embedder::Embedder()
{
devID = -1;
vSize = -1;
......@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
}
/* de-constructor */
T2TEmbedder::~T2TEmbedder()
Embedder::~Embedder()
{
}
......@@ -47,7 +44,7 @@ initialize the model
>> config - configurations of the model
>> isEnc - indicates if it is used for the encoder
*/
void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
void Embedder::InitModel(Config& config, bool isEnc)
{
devID = config.devID;
d = config.modelSize;
......@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
make positional embeddings (of size eSize * length)
>> length - length of the sequence
*/
void T2TEmbedder::MakePosEmbedding(int length)
void Embedder::MakePosEmbedding(int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
......@@ -110,58 +107,45 @@ make the network
>> isTraining - indicates whether it is training
<< return - word & position embeddings of the input
*/
XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
{
/* make sure the padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
InitTensor1D(&position, input.GetDim(-1), X_INT, devID);
if (!isDec)
if (!isDec || isTraining || input.GetDim(-1) > 1)
{
/* encoder embeddings */
for (int i = 0; i < inputCPU.dimSize[0]; i++) {
int startNoPad = 1 + 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
for (int j = 0; j < inputCPU.dimSize[1]; j++) {
if (p[j] == 1) {
posData[i * inputCPU.dimSize[1] + j] = 1;
}
else {
posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
}
}
}
position.SetData(posData, position.unitNum);
position.Range(0, position.unitNum, 1);
// disable grad
ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
}
else
{
/* decoder embeddings */
position.SetDataFixed(nstep + 2);
/* decoder embeddings during decoding */
position.SetDataFixed(nstep + padIdx + 1);
}
delete[] posData;
/* we make positional embeddings first */
posEmbedding = Gather(posEmbeddingBase, position);
XTensor embTMP;
embTMP = Gather(posEmbeddingBase, position);
posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));
/* then we make word embeddings */
//w.enableGrad = false;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
/* we sum over the two embeddings */
return wordEmbedding + posEmbedding;
SumMe(wordEmbedding, posEmbedding);
return wordEmbedding;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
*/
#ifndef __T2TEMBEDDING_H__
#define __T2TEMBEDDING_H__
#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
#define DEFAULT_EMBEDDING_SIZE 512
......@@ -37,7 +36,7 @@ namespace transformer
embedding (of word at position i):
word embedding + positional embedding
*/
class T2TEmbedder
class Embedder
{
public:
/* device id */
......@@ -52,7 +51,7 @@ public:
/* maximum length of the sequence */
int maxLength;
/* dimension size of the hidden layers in the t2t model */
/* dimension size of the hidden layers in the model */
int d;
/* padding index */
......@@ -67,13 +66,13 @@ public:
public:
/* constructor */
T2TEmbedder();
Embedder();
/* de-constructor */
~T2TEmbedder();
~Embedder();
/* initialize the model */
void InitModel(T2TConfig& config, bool isEnc = true);
void InitModel(Config& config, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int length);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TFNN.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "FNN.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TFNN::T2TFNN()
FNN::FNN()
{
inSize = -1;
outSize = -1;
......@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
}
/* de-constructor */
T2TFNN::~T2TFNN()
FNN::~FNN()
{
}
......@@ -50,7 +47,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TFNN::InitModel(T2TConfig& config)
void FNN::InitModel(Config& config)
{
devID = config.devID;
......@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
}
......@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
XTensor FNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TFNN_H__
#define __T2TFNN_H__
#ifndef __FNN_H__
#define __FNN_H__
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
class T2TFNN
class FNN
{
public:
/* device id */
......@@ -66,13 +65,13 @@ public:
public:
/* constructor */
T2TFNN();
FNN();
/* de-constructor */
~T2TFNN();
~FNN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,16 +18,13 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TGatedLinearUnit.h"
#include "GLU.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/function/FHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
......@@ -48,7 +44,7 @@ GLU::~GLU()
initialize the model
>> config - configurations of the model
*/
void GLU::InitModel(T2TConfig& config)
void GLU::InitModel(Config& config)
{
devID = config.devID;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -23,12 +22,11 @@
#ifndef __GLU_H__
#define __GLU_H__
#include "T2TLayerNormal.h"
#include "T2TGatedLinearUnit.h"
#include "LayerNorm.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
......@@ -68,7 +66,7 @@ public:
~GLU();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,19 +18,16 @@
* $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
#define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
#define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)
namespace transformer
namespace nmt
{
/* constructor */
......@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
initialize the model
>> config - configurations of the model
*/
void LayerHistory::InitModel(T2TConfig& config)
void LayerHistory::InitModel(Config& config)
{
devID = config.devID;
d = config.modelSize;
......@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)
InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);
layerNorms = new T2TLN[nlayer];
layerNorms = new LN[nlayer];
/* initialize the layer normalization of each layer */
for (int i = 0; i < nlayer; i++) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,14 +21,14 @@
#ifndef __LAYERHISTORY_H__
#define __LAYERHISTORY_H__
#include "T2TLayerNormal.h"
#include "T2TLayerHistory.h"
#include "LayerNorm.h"
#include "LayerHistory.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
......@@ -61,7 +60,7 @@ public:
TensorList history;
/* layer normalization for each intimidate layer */
T2TLN* layerNorms;
LN* layerNorms;
public:
/* constructor */
......@@ -71,7 +70,7 @@ public:
~LayerHistory();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* add the layer output to the history */
void Add(XTensor& tensor);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,24 +19,23 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "T2TLayerNormal.h"
#include "Embedding.h"
#include "LayerNorm.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TLN::T2TLN()
LN::LN()
{
devID = -1;
d = 0;
}
/* de-constructor */
T2TLN::~T2TLN()
LN::~LN()
{
}
......@@ -47,7 +45,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> config - configurations of the model
*/
void T2TLN::InitModel(T2TConfig& config)
void LN::InitModel(Config& config)
{
devID = config.devID;
......@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
w.SetDataFixed(1);
}
/*
......@@ -64,7 +64,7 @@ make the network
>> input - the input tensor
>> return - layer normalization output
*/
XTensor T2TLN::Make(XTensor& input)
XTensor LN::Make(XTensor& input)
{
XTensor& x = input;
XTensor xn;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,20 +19,20 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLAYERNORMAL_H__
#define __T2TLAYERNORMAL_H__
#ifndef __LAYERNORMAL_H__
#define __LAYERNORMAL_H__
#include "T2TUtility.h"
#include "../../../network/XNet.h"
#include "../Utility.h"
#include "../../../network//XNet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* layer normalization: y = norm(x) * w + b
where norm(x) = (x - mean)/standardDeviation */
class T2TLN
class LN
{
public:
/* device id */
......@@ -50,13 +49,13 @@ public:
public:
/* constructor */
T2TLN();
LN();
/* de-constructor */
~T2TLN();
~LN();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network */
XTensor Make(XTensor& input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,12 +15,12 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#include "T2TNNUtil.h"
#include "NNUtil.h"
namespace transformer
namespace nmt
{
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,11 +15,11 @@
*/
/*
* $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
*/
#ifndef __T2TNNUTIL_H__
#define __T2TNNUTIL_H__
#ifndef __NNUTIL_H__
#define __NNUTIL_H__
#include "../../../tensor/XGlobal.h"
#include "../../../tensor/core/CHeader.h"
......@@ -28,7 +27,7 @@
using namespace nts;
namespace transformer
namespace nmt
{
/* the gather function for tensor with any dimension */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,18 +19,16 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include <cmath>
#include "T2TOutput.h"
#include "T2TUtility.h"
#include "T2TEmbedding.h"
#include "Output.h"
#include "Embedding.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TOutput::T2TOutput()
Output::Output()
{
devID = -1;
vSize = -1;
......@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
}
/* de-constructor */
T2TOutput::~T2TOutput()
Output::~Output()
{
}
......@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
initialize the model
>> config - configurations of the model
*/
void T2TOutput::InitModel(T2TConfig& config)
void Output::InitModel(Config& config)
{
devID = config.devID;
hSize = config.modelSize;
......@@ -66,7 +63,7 @@ make the network (redefined output tensor)
>> isTraining - whether it is used for training
>> normalized - whether ignore the log-softmax
*/
void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
{
XTensor& x = input;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,19 +19,19 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TOUTPUT_H__
#define __T2TOUTPUT_H__
#ifndef __OUTPUT_H__
#define __OUTPUT_H__
#include "T2TUtility.h"
#include "../Utility.h"
#include "../../../tensor/function/FHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* output layer */
class T2TOutput
class Output
{
public:
/* device id */
......@@ -49,13 +48,13 @@ public:
public:
/* constructor */
T2TOutput();
Output();
/* de-constructor */
~T2TOutput();
~Output();
/* initialize the model */
void InitModel(T2TConfig& config);
void InitModel(Config& config);
/* make the network (redefined output tensor) */
void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TBatchLoader.h"
#include "../module/T2TUtility.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../network/XNoder.h"
namespace transformer
{
/* constructor: put the loader into a clean "nothing loaded" state */
T2TBatchLoader::T2TBatchLoader()
{
    /* heap buffers are created later in Init() */
    buf = NULL;
    buf2 = NULL;
    bufBatch = NULL;
    seqLen = NULL;
    seqLen2 = NULL;
    seqOffset = NULL;

    /* counters and cursors: -1 marks "no batch/sequence loaded yet" */
    bufSize = 0;
    bufBatchSize = 0;
    nseqBuf = 0;
    nextSeq = -1;
    nextBatch = -1;
}
/* de-constructor: release every heap buffer (delete[] on NULL is a no-op) */
T2TBatchLoader::~T2TBatchLoader()
{
    delete[] seqOffset;
    delete[] seqLen2;
    delete[] seqLen;
    delete[] bufBatch;
    delete[] buf2;
    delete[] buf;
}
/*
initialization of the batch loader
>> config - global configurations: buffer size, batching strategy flags
   (isDoubledEnd / isSmallBatch / isBigBatch / isRandomBatch) and the
   bucket size used when sorting samples by length
*/
void T2TBatchLoader::Init(T2TConfig& config)
{
    bufSize = config.bufSize;
    isDoubledEnd = config.isDoubledEnd;
    isSmallBatch = config.isSmallBatch;
    isBigBatch = config.isBigBatch;
    isRandomBatch = config.isRandomBatch;
    bucketSize = config.bucketSize;

    /* release any previously-allocated buffers so a second Init() does not
       leak (delete[] on NULL is a no-op; the constructor NULLs these) */
    delete[] buf;
    delete[] buf2;
    delete[] bufBatch;
    delete[] seqLen;
    delete[] seqLen2;
    delete[] seqOffset;

    /* the word-id buffers and the per-sequence length/offset bookkeeping
       arrays all share one capacity */
    buf = new int[bufSize];
    buf2 = new int[bufSize];
    bufBatch = new BatchNode[bufSize];
    seqLen = new int[bufSize];
    seqLen2 = new int[bufSize];
    seqOffset = new int[bufSize];
}
/* shared line buffer used by LoadBuf; being file-scope, it makes the loading
   routines non-reentrant and not thread-safe */
char line[MAX_SEQUENCE_LENGTH];
/* a sample (a group of `step` consecutive sequences) used when sorting the
   buffer for batching */
struct SampleNode
{
    /* index of the node in creation order */
    int id;

    /* index of the first sequence of this sample in the sequence arrays */
    int offset;

    /* pointer to the words of this sample inside the word buffer */
    int* p;

    /* total number of words in the sample */
    int size;

    /* sort value: the maximum sequence length within the sample */
    int value;

    /* random key used to shuffle samples within a bucket */
    int key;
};

/* qsort comparator: descending by value. explicit comparisons are used
   instead of "b - a", whose subtraction can overflow int for extreme
   operands (undefined behavior) */
int CompareSampleNode(const void* a, const void* b)
{
    int va = ((SampleNode*)a)->value;
    int vb = ((SampleNode*)b)->value;
    return (vb > va) - (vb < va);
}

/* qsort comparator: descending by the random key. keys come from rand(),
   so the difference of two keys can exceed INT_MAX; the comparison form
   avoids that overflow */
int CompareSampleNodeV2(const void* a, const void* b)
{
    int ka = ((SampleNode*)a)->key;
    int kb = ((SampleNode*)b)->key;
    return (kb > ka) - (kb < ka);
}
/*
load data to the buffer
>> file - file to load data from; each line holds word ids separated by
   spaces/tabs, and a " ||| " token splits multiple sequences within a line
>> isSorted - whether to sort the loaded samples by length
>> step - the number of sequences that form one sample (e.g., 2 for
   source/target pairs in MT)
<< returns the number of lines read
*/
int T2TBatchLoader::LoadBuf(FILE* file, bool isSorted, int step)
{
    int lineCount = 0;
    int seqCount = 0;
    int wordCount = 0;

    while (fgets(line, MAX_SEQUENCE_LENGTH - 1, file)) {
        int len = (int)strlen(line);

        /* strip trailing newline characters; the "len > 0" guard prevents an
           out-of-bounds read (line[-1]) on lines made only of '\r'/'\n' */
        while (len > 0 && (line[len - 1] == '\r' || line[len - 1] == '\n')) {
            line[len - 1] = 0;
            len--;
        }

        len = (int)strlen(line);
        if (len == 0)
            continue;

        /* how many characters are in a word */
        int wSize = 0;

        /* how many words are in the sentence */
        int wNum = 0;
        int wNumLocal = 0;

        int i = 0;
        for (i = 0; i < len; i++) {
            /* load word (id) separated by space or tab */
            if ((line[i] == ' ' || line[i] == '\t') && wSize > 0) {
                line[i] = 0;
                if (wSize == 3 && line[i - 1] == '|' && line[i - 2] == '|' && line[i - 3] == '|') {
                    /* a "|||" token closes the current sequence of the line */
                    seqLen[seqCount] = wNumLocal;
                    seqOffset[seqCount] = wordCount + wNum - wNumLocal;
                    seqCount++;
                    wNumLocal = 0;
                }
                else {
                    buf[wordCount + wNum++] = atoi(line + i - wSize);
                    wNumLocal++;
                }
                wSize = 0;
            }
            else
                wSize++;
        }

        /* flush the last word of the line */
        if (wSize > 0) {
            buf[wordCount + wNum++] = atoi(line + i - wSize);
            wNumLocal++;
        }

        /* close the last sequence of the line */
        seqLen[seqCount] = wNumLocal;
        seqOffset[seqCount] = wordCount + wNum - wNumLocal;
        seqCount++;

        wordCount += wNum;
        lineCount++;

        /* stop before the word buffer could overflow on the next line */
        if (wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
            break;

        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
    }

    nseqBuf = seqCount;
    nextSeq = 0;

    /* sort the sequences by length */
    if (isSorted) {
        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
        SampleNode* nodes = new SampleNode[seqCount];
        int count = 0;
        int offset = 0;
        for (int i = 0; i < seqCount; i += step) {
            SampleNode& node = nodes[count];
            node.id = count;
            node.offset = i;
            node.p = buf + offset;
            node.size = 0;
            int max = 0;
            for (int j = 0; j < step; j++) {
                node.size += seqLen[i + j];
                max = MAX(max, seqLen[i + j]);
            }
            node.value = max;

            /* random key for shuffling within a bucket later */
            node.key = rand();
            count++;
            offset += node.size;
        }

        /* sort the samples by their maximum sequence length (descending) */
        qsort(nodes, count, sizeof(SampleNode), CompareSampleNode);

        /* distribute samples into buckets. In each bucket, sequences have
           similar a length */
        if (bucketSize > 0) {
            int low = 0;
            int high = low + bucketSize;
            int n = count - 1;
            int m = n;
            int num = 0;
            while (num < count) {
                for (m = n; m >= 0; m--) {
                    if (nodes[m].value > high)
                        break;
                }

                /* shuffle the samples that fall into the current bucket */
                qsort(nodes + m + 1, n - m, sizeof(SampleNode), CompareSampleNodeV2);
                num += (n - m);
                n = m;
                low += bucketSize;
                high = low + bucketSize;
            }
        }

        /* write the reordered words/lengths/offsets into the spare arrays */
        count = 0;
        offset = 0;
        for (int i = 0; i < seqCount; i += step) {
            SampleNode& node = nodes[count];
            memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
            for (int j = 0; j < step; j++) {
                seqLen2[i + j] = seqLen[node.offset + j];

                /* NOTE(review): this only adds the immediately preceding
                   sequence's length, so it is correct for step <= 2 only —
                   confirm larger steps are never used */
                seqOffset[i + j] = offset + (j > 0 ? seqLen[node.offset + j - 1] : 0);
            }
            count += 1;
            offset += node.size;
        }

        /* swap the double buffers */
        int* tmp = buf;
        buf = buf2;
        buf2 = tmp;
        tmp = seqLen;
        seqLen = seqLen2;
        seqLen2 = tmp;

        delete[] nodes;
    }

    return lineCount;
}
/* empty the data buffer so that the next access triggers a reload */
void T2TBatchLoader::ClearBuf()
{
    nextSeq = -1;
    nseqBuf = 0;
}
/*
set the random batch flag
>> flag - as it is
*/
void T2TBatchLoader::SetRandomBatch(bool flag)
{
isRandomBatch = flag;
}
/*
load a batch of sequences, dispatching to the LM or MT loader
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training lms
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary (MT only)
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - (returned) source-side word count (MT only)
>> wCount - (returned) word count
>> devID - device id
>> isTraining - indicates whether we are training the model
*/
int T2TBatchLoader::LoadBatch(FILE* file, bool isLM,
                              XTensor* batchEnc, XTensor* paddingEnc,
                              XTensor* batchDec, XTensor* paddingDec,
                              XTensor* gold, XTensor* label,
                              int* seqs,
                              int vsEnc, int vsDec, int sBatch, int wBatch,
                              bool isSorted, int& ws, int& wCount,
                              int devID, bool isTraining)
{
    /* language modeling: single-sequence samples; vsDec and ws are unused */
    if (isLM)
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec,
                           gold, label, seqs, vsEnc, sBatch, wBatch,
                           isSorted, wCount, devID, isTraining);

    /* machine translation: paired source/target samples */
    return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec,
                       gold, label, seqs, vsEnc, vsDec, sBatch, wBatch,
                       isSorted, ws, wCount, devID, isTraining);
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSize - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - (returned) word count
>> devID - device id
>> isTraining - indicates whether we are training the model
<< returns the number of sequences in the batch
*/
int T2TBatchLoader::LoadBatchLM(FILE* file,
                                XTensor* batchEnc, XTensor* paddingEnc,
                                XTensor* batchDec, XTensor* paddingDec,
                                XTensor* gold, XTensor* label,
                                int* seqs,
                                int vSize, int sBatch, int wBatch,
                                bool isSorted, int& wCount,
                                int devID, bool isTraining)
{
    /* refill the buffer when it has been fully consumed */
    if (nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);

    int seq = MAX(nextSeq, 0);
    int wc = 0;
    int wn = 0;
    int sc = 0;
    int max = 0;

    /* grow the batch until both the sequence count (sBatch) and the word
       budget (wBatch) are met */
    while (seq + sc < nseqBuf) {
        int len = isDoubledEnd ? seqLen[seq + sc] : seqLen[seq + sc] - 1;
        CheckNTErrors(len > 0, "Empty sequence!");
        wn = len;
        wc += wn;
        sc += 1;

        if (max < wn)
            max = wn;

        /* effective token count: total words in "big batch" mode, otherwise
           the padded size max * sc */
        int tc = isBigBatch ? wc : max * sc;
        if (sc >= sBatch && tc >= wBatch)
            break;
    }

    wCount = 0;
    nextSeq = seq + sc;

    if (sc <= 0)
        return 0;

    int dims[MAX_TENSOR_DIM_NUM];
    dims[0] = sc;
    dims[1] = max;
    dims[2] = vSize;

    InitTensor2D(batchEnc, sc, max, X_INT, devID);
    InitTensor2D(label, sc, max, X_INT, devID);
    InitTensor(gold, 3, dims, X_FLOAT, devID);
    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID);
    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    label->SetZeroAll();
    gold->SetZeroAll();
    paddingEnc->SetZeroAll();
    paddingDec->SetZeroAll();

    /* host-side staging arrays; the tensors are filled in one shot below */
    int seqSize = 0;
    int* batchEncValues = new int[batchEnc->unitNum];
    int* labelValues = new int[label->unitNum];
    MTYPE* goldOffsets = new MTYPE[gold->unitNum];
    MTYPE* paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
    MTYPE* paddingDecOffsets = new MTYPE[paddingDec->unitNum];

    int wGold = 0;

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(labelValues, 0, sizeof(int) * label->unitNum);

    for (int s = seq; s < seq + sc; s++) {
        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
        CheckNTErrors(len <= max, "Something is wrong!");
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchEncValues[(int)batchEnc->GetOffset2D(s - seq, w)] = num;
            paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
            paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);

            /* next-word prediction: the gold/label at position w-1 is the
               word observed at position w */
            if (w > 0) {
                goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w - 1, num);
                labelValues[(int)label->GetOffset2D(s - seq, w - 1)] = buf[seqOffset[s] + w];
            }

            /* at the last position: predict the word itself when </s> is
               doubled, otherwise the following word in the buffer */
            if (w == len - 1) {
                if (isDoubledEnd) {
                    goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, num);
                    labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w];
                }
                else {
                    goldOffsets[wGold++] = gold->GetOffset3D(s - seq, w, buf[seqOffset[s] + w + 1]);
                    labelValues[(int)label->GetOffset2D(s - seq, w)] = buf[seqOffset[s] + w + 1];
                }
            }

            wCount++;

            if (seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
        }

        /* pad the kept sequence array with -1 */
        if (seqs != NULL) {
            for (int w = len; w < max; w++)
                seqs[seqSize++] = -1;
        }
    }

    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    label->SetData(labelValues, label->unitNum);
    gold->SetDataBatched(goldOffsets, 1.0F, wGold);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);

    /*XTensor * tmp = NewTensorBuf(paddingEnc, devID);
    _ConvertDataType(batchEnc, tmp);
    _NotEqual(tmp, paddingEnc, 0);
    DelTensorBuf(tmp);

    XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
    _ConvertDataType(batchEnc, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);*/

    delete[] batchEncValues;
    delete[] labelValues;
    delete[] goldOffsets;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;

    return sc;
}
int CompareBatchNode(const void* a, const void* b)
{
return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
}
/*
load a batch of sequences (for MT); sequences in the buffer are stored as
(source, target) pairs at even/odd positions
>> file - the handle to the data file
>> batchEnc - the batch of the input sequences
>> paddingEnc - padding of the input sequences
>> batchDec - the batch of the output sequences
>> paddingDec - padding of the output sequences
>> gold - gold standard (distribution of every position; currently disabled)
>> label - (gold standard) label index of every position
>> seqs - keep the sequences in an array
>> vSizeEnc - size of the encoder vocabulary
>> vSizeDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> ws - (returned) number of source-side words in the batch
>> wCount - (returned) target-side word count (the last position of each
   target sequence is excluded)
>> devID - device id
>> isTraining - indicates whether we are training the model
<< returns the number of sequences (source + target) in the batch
*/
int T2TBatchLoader::LoadBatchMT(FILE* file,
                                XTensor* batchEnc, XTensor* paddingEnc,
                                XTensor* batchDec, XTensor* paddingDec,
                                XTensor* gold, XTensor* label,
                                int* seqs,
                                int vSizeEnc, int vSizeDec, int sBatch, int wBatch,
                                bool isSorted, int& ws, int& wCount,
                                int devID, bool isTraining)
{
    /* refill and re-segment the buffer when all batches are consumed */
    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);

        int seq = 0;

        bufBatchSize = 0;
        nextBatch = 0;

        /* we segment the buffer into batches */
        while (seq < nseqBuf) {
            int wcEnc = 0;
            int wcDec = 0;
            int wnEnc = 0;
            int wnDec = 0;
            int maxEnc = 0;
            int maxDec = 0;
            int sc = 0;

            while (seq + sc < nseqBuf) {

                /* source-side sequence */
                wnEnc = seqLen[seq + sc];

                /* target-side sequence */
                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;

                /* effective token count: total words in "big batch" mode,
                   otherwise the padded size; (sc + 2) / 2 counts the pairs
                   including the current one */
                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;

                if (sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
                    break;

                wcEnc += wnEnc;
                sc += 1;

                if (maxEnc < wnEnc)
                    maxEnc = wnEnc;

                wcDec += wnDec;
                sc += 1;

                if (maxDec < wnDec)
                    maxDec = wnDec;
            }

            /* record the batch: [beg, end) sequence range plus max lengths */
            BatchNode& batch = bufBatch[bufBatchSize];
            batch.beg = seq;
            batch.end = seq + sc;
            batch.maxEnc = maxEnc;
            batch.maxDec = maxDec;
            batch.key = rand();
            bufBatchSize++;

            seq = seq + sc;
        }

        /* shuffle the batches by sorting on their random keys */
        if (isRandomBatch)
            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }

    if (bufBatchSize <= 0)
        return 0;

    BatchNode& batch = bufBatch[nextBatch++];
    int seq = batch.beg;
    int sc = batch.end - batch.beg;
    int maxEnc = batch.maxEnc;
    int maxDec = batch.maxDec;
    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");

    int sCount = sc / 2;
    int seqSize = 0;

    InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
    InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
    InitTensor2D(label, sCount, maxDec, X_INT, devID);
    //InitTensor(gold, 3, dimsDec, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
    batchDec->SetZeroAll();
    paddingDec->SetZeroAll();
    label->SetZeroAll();
    //gold->SetZeroAll();

    int wCountEnc = 0;
    int wCountDec = 0;
    int wCountPad = 0;
    wCount = 0;

    /* host-side staging arrays; the tensors are filled in one shot below */
    int* batchEncValues = new int[batchEnc->unitNum];
    int* batchDecValues = new int[batchDec->unitNum];
    int* labelValues = new int[label->unitNum];
    MTYPE* paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
    MTYPE* paddingDecOffsets = new MTYPE[sc * maxDec / 2];
    //MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];

    memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
    memset(batchDecValues, 0, sizeof(int) * batchDec->unitNum);

    /* NOTE(review): labelValues was allocated with label->unitNum but is
       cleared with batchDec->unitNum; both tensors are sCount x maxDec here
       so the sizes coincide — keep them in sync if shapes ever diverge */
    memset(labelValues, 0, sizeof(int) * batchDec->unitNum);

    /* batch of the source-side sequences (even positions of the pairs) */
    for (int s = seq; s < seq + sc; s += 2) {
        int len = seqLen[s];
        int sent = (s - seq) / 2;
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchEncValues[batchEnc->GetOffset2D(sent, w)] = num;
            paddingEncOffsets[wCountEnc] = paddingEnc->GetOffset2D(sent, w);
            wCountEnc++;
        }
    }
    ws = wCountEnc;
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);

    //XTensor * tmp = NewTensorBuf(paddingEnc, devID);
    //_ConvertDataType(batchEnc, tmp);
    //tmp->Dump(stderr, "tmp:");
    //_NotEqual(tmp, paddingEnc, 0);
    //DelTensorBuf(tmp);

    /* batch of the target-side sequences (odd positions of the pairs) */
    for (int s = seq + 1; s < seq + sc; s += 2) {
        int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
        CheckNTErrors(len <= maxDec, "Something is wrong!");
        int sent = (s - seq - 1) / 2;
        for (int w = 0; w < len; w++) {
            int num = buf[seqOffset[s] + w];
            batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
            //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);

            /* padding (and wCount) skips the last target position */
            if (w < len - 1) {
                paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
                wCount++;
            }

            /* shifted target: the label at position w-1 is the word at w */
            if (w > 0) {
                //goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
                labelValues[label->GetOffset2D(sent, w - 1)] = buf[seqOffset[s] + w];
            }

            /* at the last position: the word itself when </s> is doubled,
               otherwise the following word in the buffer */
            if (w == len - 1) {
                if (isDoubledEnd) {
                    //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w]);
                    labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w];
                }
                else {
                    //goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
                    labelValues[label->GetOffset2D(sent, w)] = buf[seqOffset[s] + w + 1];
                }
            }

            //wCount++;
            wCountDec++;
            if (seqs != NULL)
                seqs[seqSize++] = buf[seqOffset[s] + w];
        }

        /* pad the kept sequence array with -1 */
        if (seqs != NULL) {
            for (int w = len; w < maxDec; w++)
                seqs[seqSize++] = -1;
        }
    }

    batchDec->SetData(batchDecValues, batchDec->unitNum);
    label->SetData(labelValues, label->unitNum);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);

    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

    //gold->SetDataBatched(goldOffsets, 1.0F, wGold);

    delete[] batchEncValues;
    delete[] batchDecValues;
    delete[] labelValues;
    delete[] paddingEncOffsets;
    delete[] paddingDecOffsets;
    //delete[] goldOffsets;

    return sc;
}
/*
shuffle lines of the file
>> srcFile - the source file to shuffle
>> tgtFile - the resulting file
*/
void T2TBatchLoader::Shuffle(const char* srcFile, const char* tgtFile)
{
char* line = new char[MAX_LINE_LENGTH];
#ifndef WIN32
sprintf(line, "shuf %s > %s", srcFile, tgtFile);
system(line);
#else
ShowNTErrors("Cannot shuffle the file on WINDOWS systems!");
#endif
delete[] line;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but I'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../module/T2TUtility.h"
#include "../../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node that describes one prepared mini-batch inside the sequence buffer */
struct BatchNode
{
/* beginning position of the batch in the sequence buffer */
int beg;
/* end position of the batch in the sequence buffer */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting (used to reorder batches) */
int key;
};
/* batch loader of the original Transformer sample: reads token sequences
   into flat buffers and serves them as padded mini-batches */
class T2TBatchLoader
{
public:
/* buffer for loading words (token ids of all buffered sequences) */
int* buf;
/* another buffer (presumably used when reordering sequences - confirm in LoadBuf) */
int* buf2;
/* buffered batch descriptors, one BatchNode per prepared batch */
BatchNode* bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer (number of nodes in "bufBatch") */
int bufBatchSize;
/* length of each sequence */
int* seqLen;
/* another length array (counterpart of "buf2" - confirm in LoadBuf) */
int* seqLen2;
/* offset of the first word for each sequence */
int* seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of LM */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size (for grouping sequences of similar length) */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization with the training configuration */
void Init(T2TConfig& config);
/* load data to buffer; returns the number of loaded items */
int LoadBuf(FILE* file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences (dispatches to the LM or MT variant) */
int LoadBatch(FILE* file, bool isLM,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vs, int sBatch, int wBatch,
bool isSorted, int& wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE* file,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec,
XTensor* gold, XTensor* label,
int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int& ws, int& wCount,
int devID, bool isTraining);
/* shuffle the data file (writes the shuffled copy to "tgtFile") */
void Shuffle(const char* srcFile, const char* tgtFile);
};
}
#endif
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-08-09
* TODO: refactor the data loader class and references
*/
#include <string>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <algorithm>
#include "TrainDataSet.h"
#include "../Utility.h"
#include "../translate/Vocab.h"
using namespace nmt;
namespace nts {
/* sort the dataset by length (in descending order) */
void TrainDataSet::SortByLength() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return (a->srcSent.Size() + a->tgtSent.Size())
> (b->srcSent.Size() + b->tgtSent.Size());
});
}
/* sort buckets by key (in descending order) */
void TrainDataSet::SortBucket() {
sort(buffer.items, buffer.items + buffer.count,
[](TrainExample* a, TrainExample* b) {
return a->bucketKey > b->bucketKey;
});
}
/*
sort the output by key in a range (in descending order)
>> begin - the first index of the range
>> end - the last index of the range
*/
void TrainDataSet::SortInBucket(int begin, int end) {
sort(buffer.items + begin, buffer.items + end,
[](TrainExample* a, TrainExample* b) {
return (a->key > b->key);
});
}
/*
load all data from a file to the buffer
training data format (binary):
first 8 bytes: number of sentence pairs
subsequent segments:
    source sentence length (4 bytes)
    target sentence length (4 bytes)
    source tokens (4 bytes per token)
    target tokens (4 bytes per token)
*/
void TrainDataSet::LoadDataToBuffer()
{
    buffer.Clear();
    curIdx = 0;

    int id = 0;
    uint64_t sentNum = 0;

    int srcVocabSize = 0;
    int tgtVocabSize = 0;

    /* check every read so that a truncated or corrupt file fails loudly
       instead of silently producing garbage lengths */
    size_t r = fread(&srcVocabSize, sizeof(srcVocabSize), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the source vocabulary size");
    r = fread(&tgtVocabSize, sizeof(tgtVocabSize), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the target vocabulary size");

    r = fread(&sentNum, sizeof(uint64_t), 1, fp);
    CheckNTErrors(r == 1, "Failed to read the sentence pairs number");
    CheckNTErrors(sentNum > 0, "Invalid sentence pairs number");

    while ((uint64_t)id < sentNum) {
        int srcLen = 0;
        int tgtLen = 0;
        r = fread(&srcLen, sizeof(int), 1, fp);
        CheckNTErrors(r == 1, "Failed to read the source sentence length");
        r = fread(&tgtLen, sizeof(int), 1, fp);
        CheckNTErrors(r == 1, "Failed to read the target sentence length");
        CheckNTErrors(srcLen > 0, "Invalid source sentence length");
        CheckNTErrors(tgtLen > 0, "Invalid target sentence length");

        IntList srcSent;
        IntList tgtSent;
        srcSent.ReadFromFile(fp, srcLen);
        tgtSent.ReadFromFile(fp, tgtLen);

        TrainExample* example = new TrainExample;
        example->id = id++;
        example->key = id;
        example->srcSent = srcSent;
        example->tgtSent = tgtSent;

        buffer.Add(example);
    }

    /* the file is fully consumed here; drop the handle so it cannot
       be closed twice */
    fclose(fp);
    fp = NULL;

    XPRINT1(0, stderr, "[INFO] loaded %d sentences\n", id);
}
/*
load a mini-batch to the device (for training)
>> batchEnc - a tensor to store the batch of encoder input
>> paddingEnc - a tensor to store the batch of encoder paddings
>> batchDec - a tensor to store the batch of decoder input
>> paddingDec - a tensor to store the batch of decoder paddings
>> label - a tensor to store the gold output (decoder input shifted left by one)
>> minSentBatch - the minimum number of sentences in a batch
>> batchSize - the maximum number of words in a batch
>> devID - the device id, -1 for the CPU
<< return - a list holding {number of target tokens, number of sentences}
*/
UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    XTensor* batchDec, XTensor* paddingDec, XTensor* label,
    size_t minSentBatch, size_t batchSize, int devID)
{
    UInt64List info;
    size_t srcTokenNum = 0;
    size_t tgtTokenNum = 0;
    int realBatchSize = 1;

    /* outside training the sentence number is fixed */
    if (!isTraining)
        realBatchSize = (int)minSentBatch;

    /* get the maximum source sentence length in a mini-batch */
    size_t maxSrcLen = buffer[curIdx]->srcSent.Size();

    /* max batch size */
    const int MAX_BATCH_SIZE = 512;

    /* dynamic batching for sentences, enabled when the dataset is used for training */
    if (isTraining) {
        while (((size_t)realBatchSize < (buffer.Size() - curIdx))
            && (realBatchSize * maxSrcLen < batchSize)
            && (realBatchSize < MAX_BATCH_SIZE)
            && (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
            if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
                maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
            realBatchSize++;
        }
    }

    /* clamp the batch size to what is left in the buffer */
    if ((buffer.Size() - curIdx) < (size_t)realBatchSize) {
        realBatchSize = (int)(buffer.Size() - curIdx);
    }

    CheckNTErrors(realBatchSize > 0, "Invalid batch size");

    /* get the maximum target sentence length in a mini-batch */
    size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
    for (int i = 0; i < realBatchSize; i++) {
        if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
            maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
    }
    for (int i = 0; i < realBatchSize; i++) {
        if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
            maxSrcLen = buffer[curIdx + i]->srcSent.Size();
    }

    CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");

    int* batchEncValues = new int[realBatchSize * maxSrcLen];
    float* paddingEncValues = new float[realBatchSize * maxSrcLen];

    int* labelValues = new int[realBatchSize * maxTgtLen];
    int* batchDecValues = new int[realBatchSize * maxTgtLen];
    float* paddingDecValues = new float[realBatchSize * maxTgtLen];

    /* initialize everything as padding; real tokens overwrite it below */
    for (size_t i = 0; i < realBatchSize * maxSrcLen; i++) {
        batchEncValues[i] = PAD;
        paddingEncValues[i] = 1.0F;
    }
    for (size_t i = 0; i < realBatchSize * maxTgtLen; i++) {
        batchDecValues[i] = PAD;
        labelValues[i] = PAD;
        paddingDecValues[i] = 1.0F;
    }

    size_t curSrc = 0;
    size_t curTgt = 0;

    /*
    batchEnc: ends with EOS (right padding)
    batchDec: begins with SOS (right padding)
    label: ends with EOS (right padding)
    */
    for (int i = 0; i < realBatchSize; ++i) {
        srcTokenNum += buffer[curIdx + i]->srcSent.Size();
        tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();

        curSrc = maxSrcLen * i;
        for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
            batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
        }

        curTgt = maxTgtLen * i;
        for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
            /* the label is the decoder input shifted left by one position */
            if (j > 0)
                labelValues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
            batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
        }
        labelValues[curTgt - 1] = EOS;

        /* zero out the paddings after the real tokens */
        while (curSrc < maxSrcLen * (i + 1))
            paddingEncValues[curSrc++] = 0;
        while (curTgt < maxTgtLen * (i + 1))
            paddingDecValues[curTgt++] = 0;
    }

    InitTensor2D(batchEnc, realBatchSize, (int)maxSrcLen, X_INT, devID);
    InitTensor2D(paddingEnc, realBatchSize, (int)maxSrcLen, X_FLOAT, devID);
    InitTensor2D(batchDec, realBatchSize, (int)maxTgtLen, X_INT, devID);
    InitTensor2D(paddingDec, realBatchSize, (int)maxTgtLen, X_FLOAT, devID);
    InitTensor2D(label, realBatchSize, (int)maxTgtLen, X_INT, devID);

    /* advance the reading cursor past the consumed sentences */
    curIdx += realBatchSize;

    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetData(paddingEncValues, paddingEnc->unitNum);
    batchDec->SetData(batchDecValues, batchDec->unitNum);
    paddingDec->SetData(paddingDecValues, paddingDec->unitNum);
    label->SetData(labelValues, label->unitNum);

    delete[] batchEncValues;
    delete[] paddingEncValues;
    delete[] batchDecValues;
    delete[] paddingDecValues;
    delete[] labelValues;

    info.Add(tgtTokenNum);
    info.Add(realBatchSize);
    return info;
}
/*
initialize the dataset
>> dataFile - path of the data file
>> myBucketSize - size of the bucket to keep similar length sentence pairs
>> training - indicates whether it is used for training
*/
void TrainDataSet::Init(const char* dataFile, int myBucketSize, bool training)
{
    bucketSize = myBucketSize;
    isTraining = training;
    curIdx = 0;

    fp = fopen(dataFile, "rb");
    CheckNTErrors(fp, "can not open the training file");

    /* read everything into memory and order it by length;
       buckets are only needed for training */
    LoadDataToBuffer();
    SortByLength();
    if (training)
        BuildBucket();
}
/* check whether every sentence in the buffer has been consumed */
bool TrainDataSet::IsEmpty() {
    return curIdx >= buffer.Size();
}
/* reset the reading cursor for a new epoch */
void TrainDataSet::ClearBuf()
{
    curIdx = 0;

    /* re-sort (and re-bucket when training) so that
       different epochs see different batches */
    SortByLength();
    if (isTraining)
        BuildBucket();
}
/* group data into buckets with similar length, then shuffle the buckets
   and sort the sentences inside each bucket by source length */
void TrainDataSet::BuildBucket()
{
    size_t idx = 0;

    /* build and shuffle buckets */
    while (idx < buffer.Size()) {

        /* sentence number in a bucket */
        size_t sentNum = 1;

        /* get the maximum source sentence length in a bucket */
        size_t maxSrcLen = buffer[idx]->srcSent.Size();

        /* bucketing for sentences.
           BUGFIX: the original code indexed the look-ahead sentence with
           `curIdx + sentNum`; `curIdx` is the batch-reading cursor (reset
           to 0 before this function runs), so buckets past the start of
           the buffer checked the wrong sentence lengths. The look-ahead
           must be relative to `idx`, as in LoadBatch. */
        while ((sentNum < (buffer.Size() - idx))
            && (sentNum * maxSrcLen < bucketSize)
            && (sentNum * buffer[idx + sentNum]->srcSent.Size() < bucketSize)) {
            if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
                maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
            sentNum++;
        }

        /* make sure the number is valid */
        if ((buffer.Size() - idx) < sentNum) {
            sentNum = buffer.Size() - idx;
        }

        /* all sentences in a bucket share one random key, so sorting by
           key shuffles whole buckets */
        int randomKey = rand();
        for (size_t i = 0; i < sentNum; i++) {
            buffer[idx + i]->bucketKey = randomKey;
        }

        idx += sentNum;
    }
    SortBucket();

    /* sort sentences by source length within each bucket */
    idx = 0;
    while (idx < buffer.Size()) {
        size_t sentNum = 0;
        int bucketKey = buffer[idx + sentNum]->bucketKey;
        while (sentNum < (buffer.Size() - idx)
            && buffer[idx + sentNum]->bucketKey == bucketKey) {
            buffer[idx + sentNum]->key = (int)buffer[idx + sentNum]->srcSent.Size();
            sentNum++;
        }
        SortInBucket((int)idx, (int)(idx + sentNum));
        idx += sentNum;
    }
}
/* de-constructor */
TrainDataSet::~TrainDataSet()
{
    /* the buffer owns its examples: free every one of them */
    size_t num = buffer.Size();
    for (size_t i = 0; i < num; i++)
        delete buffer[i];
}
}
\ No newline at end of file
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __TRAIN_DATASET_H__
#define __TRAIN_DATASET_H__
#include <cstdio>
#include <vector>
#include <fstream>
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#define MAX_WORD_NUM 120
using namespace std;
namespace nts {
/* a sentence pair used for training */
struct TrainExample {
/* id of the sentence pair (its position in the original file) */
int id;
/* source language sentence (token ids) */
IntList srcSent;
/* target language sentence (token ids) */
IntList tgtSent;
/* the key used to sort items within a bucket */
int key;
/* the key used to shuffle buckets */
int bucketKey;
};
/* A `TrainDataSet` is associated with a binary file which contains training data;
   it loads the whole file into memory and serves padded mini-batches. */
struct TrainDataSet {
public:
/* the data buffer holding all loaded sentence pairs */
TrainBufferType buffer;
/* a list of empty line numbers
   (NOTE(review): not referenced in the visible code - confirm it is still used) */
IntList emptyLines;
/* the pointer to the file stream (closed once the data is loaded) */
FILE* fp;
/* current reading index in the buffer */
size_t curIdx;
/* size of used data in the buffer */
size_t bufferUsed;
/* size of the bucket used for grouping sentences of similar length */
size_t bucketSize;
/* indicates whether the dataset is used for training */
bool isTraining;
public:
/* sort the input by length (in descending order) */
void SortByLength();
/* sort buckets by key (in descending order) */
void SortBucket();
/* sort a range [begin, end) of the buffer by key (in descending order) */
void SortInBucket(int begin, int end);
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* generate a mini-batch; returns {target token number, sentence number} */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
/* check if the buffer is fully consumed */
bool IsEmpty();
/* reset the buffer for a new epoch */
void ClearBuf();
/* group data into buckets with similar length */
void BuildBucket();
/* de-constructor */
~TrainDataSet();
};
}
#endif // __TRAIN_DATASET_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,30 +18,31 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#include <cmath>
#include "T2TTrainer.h"
#include "../module/T2TUtility.h"
#include "Trainer.h"
#include "../Utility.h"
#include "../../../network/XNoder.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
#include "../../../tensor/loss/LHeader.h"
#include "../../../network/XNoder.h"
#ifndef WIN32
#include <sys/time.h>
#include <unistd.h>
#endif
#include "../../../tensor/XMem.h"
namespace transformer
namespace nmt
{
/* constructor */
T2TTrainer::T2TTrainer()
Trainer::Trainer()
{
cfg = NULL;
}
/* de-constructor */
T2TTrainer::~T2TTrainer()
Trainer::~Trainer()
{
for (int i = 0; i < moments.count; i++) {
XTensor* m = (XTensor*)moments.Get(i);
......@@ -59,15 +59,17 @@ T2TTrainer::~T2TTrainer()
initialization
>> config - configurations of the training process
*/
void T2TTrainer::Init(T2TConfig& config)
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
wBatchSize = config.wBatchSize;
bucketSize = config.bucketSize;
nepoch = config.nepoch;
nstep = config.nstep;
maxCheckpoint = config.maxCheckpoint;
d = config.modelSize;
nwarmup = config.nwarmup;
vSize = config.srcVocabSize;
......@@ -81,17 +83,12 @@ void T2TTrainer::Init(T2TConfig& config)
nStepCheckpoint = config.nStepCheckpoint;
useEpochCheckpoint = config.useEpochCheckpoint;
updateStep = config.updateStep;
isDebugged = config.isDebugged;
isLenSorted = config.isLenSorted;
adamBeta1T = 1.0F;
adamBeta2T = 1.0F;
batchLoader.Init(config);
}
int tc = 0;
/*
train the model
>> fn - training data file
......@@ -99,8 +96,14 @@ train the model
>> modelFN - where we keep the model
>> model - model to train
*/
void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model)
void Trainer::Train(const char* fn, const char* validFN,
const char* modelFN, Model* model)
{
/* disable cache during training */
for (int i = 0; i < model->decoder->nlayer; i++) {
model->decoder->selfAttCache[i].enable = false;
model->decoder->enDeAttCache[i].enable = false;
}
int step = 0;
int wc = 0;
int ws = 0;
......@@ -126,45 +129,42 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
#endif
int devID = model->devID;
XNet net;
PrepareModel(model);
double startT = GetClockSec();
for (epoch = 1; epoch <= nepoch; epoch++) {
#ifndef WIN32
if (isShuffled) {
fprintf(stderr, "shuffle the file\n");
batchLoader.Shuffle(fn, trainFN);
}
#endif
batchLoader.Init(fn, bucketSize, true);
FILE* file = fopen(trainFN, "r");
CheckNTErrors(file, "cannot open training file!");
for (epoch = 1; epoch <= nepoch; epoch++) {
wordCount = 0;
loss = 0;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* reset the batch loader */
batchLoader.ClearBuf();
/* labels */
XTensor label;
while (!batchLoader.IsEmpty())
{
XNet net;
net.Clear();
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* batch of sequences (on the encoder and decoder sides) */
XTensor batchEnc;
XTensor batchDec;
/* gold standard */
XTensor gold;
/* labels */
XTensor label;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true))
{
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
......@@ -204,10 +204,18 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
/* update the parameters */
if (gradStep == updateStep) {
/* learning rate */
lr = lrate * (1.0F / (float)sqrt((float)d)) *
(float)MIN(pow((float)validStep + 1, -0.5F - lrbias),
((float)validStep + 1) * pow((float)nwarmup, -1.5F - lrbias));
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
/* learning rate, scheduled by inverse square root */
if (step < nwarmup)
lr = warmupInitLR + step * lrStep;
else
lr = decayFactor * pow((float)step, -0.5F);
/* model update */
Update(model, lr);
......@@ -224,15 +232,21 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
break;
}
if (step == 10) {
// LOG("after backward --------");
// lossTensor.mem->ShowMemUsage(stderr);
// exit(0);
}
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch,
wordCountTotal, batchCountTotal,
loss / wordCount, exp(loss / wordCount), exp(lossBatch / wc));
LOG("elapsed=%.1fs, step=%d, epoch=%d, "
"total word=%d, total batch=%d, loss=%.3f, ppl=%.3f, lr=%.2e",
elapsed, step, epoch, wordCountTotal, batchCountTotal,
loss / wordCount / log(2.0), exp(loss / wordCount), lr);
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
}
if (nStepCheckpoint > 0 && ++nStepCheck >= nStepCheckpoint) {
......@@ -242,8 +256,6 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
}
}
fclose(file);
if (isEnd)
break;
......@@ -255,10 +267,14 @@ void T2TTrainer::Train(const char* fn, const char* validFN, const char* modelFN,
epoch = MIN(epoch, nepoch);
XPRINT7(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, loss=%.3f, ppl=%.3f\n",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount, exp(loss / wordCount));
XPRINT4(0, stderr, "[INFO] training finished (took %.1fs, step=%d, skipped=%d and epoch=%d)\n",
elapsed, step, nSkipped, epoch);
LOG("lr=%.2e, elapsed=%.1fs, step=%d, "
"epoch=%d, word=%d, loss=%.3f, ppl=%.3f",
lr, elapsed, step, epoch, wordCountTotal, loss / wordCount / log(2.0), exp(loss / wordCount));
LOG("training finished (took %.1fs, step=%d, "
"skipped=%d and epoch=%d)", elapsed, step, nSkipped, epoch);
LOG("saving the final model");
model->Dump(modelFN);
delete[] trainFN;
}
......@@ -269,7 +285,7 @@ test the model
>> ofn - output data file
>> model - model that is trained
*/
void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
void Trainer::Validate(const char* fn, const char* ofn, Model* model)
{
int wc = 0;
int ws = 0;
......@@ -278,42 +294,36 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
float loss = 0;
/* data files */
FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
batchLoader.Init(fn, 0, false);
double startT = GetClockSec();
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
while (!batchLoader.IsEmpty())
{
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
batchLoader.ClearBuf();
/* label */
XTensor label;
while (batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, model->devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* output probabilities */
XTensor output;
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
if (model->isLM)
model->MakeLM(batchEnc, output, paddingEnc, false);
......@@ -326,52 +336,20 @@ void T2TTrainer::Validate(const char* fn, const char* ofn, T2TModel* model)
int bSize = output.GetDim(0);
int length = output.GetDim(1);
/* prediction probabilities */
XTensor labelOnehot;
XTensor lossTensor;
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float lossBatch = ReduceSumAllValue(lossTensor);
/* dump the test result */
for (int s = 0; s < bSize; s++) {
DTYPE sum = 0;
int* seq = seqs + s * length;
for (int i = 0; i < length; i++) {
if (seq[i] >= 0) {
fprintf(ofile, "%d ", seq[i]);
}
else
break;
}
fprintf(ofile, "||| ");
for (int i = 0; i < length; i++) {
if (seq[i] >= 0) {
DTYPE p = lossTensor.Get2D(s, i);
fprintf(ofile, "%.3e ", p);
sum += p;
}
else
break;
}
fprintf(ofile, "||| %e\n", sum);
}
loss += lossBatch;
wordCount += wc;
sentCount += bSize;
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT5(0, stderr, "[INFO] test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)\n",
elapsed, sentCount, wordCount, loss / wordCount, exp(loss / wordCount));
LOG("test finished (took %.1fs, sentence=%d, word=%d, loss=%.3f and ppl=%.3f)",
elapsed, sentCount, wordCount, loss / wordCount / log(2.0), exp(loss / wordCount));
}
/*
......@@ -382,20 +360,29 @@ make a checkpoint
>> label - label of the model
>> id - id of the checkpoint
*/
void T2TTrainer::MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id)
void Trainer::MakeCheckpoint(Model* model, const char* validFN,
const char* modelFN, const char* label, int id)
{
fprintf(stderr, "make a checkpoint\n");
LOG("make a checkpoint");
char* fn = new char[MAX_LINE_LENGTH];
Trainer validator;
validator.Init(*cfg);
/* save last checkpoints */
id = validator.maxCheckpoint - (maxCheckpoint--);
if (maxCheckpoint == 0)
maxCheckpoint = validator.maxCheckpoint;
sprintf(fn, "%s.%s.%03d", modelFN, label, id);
model->Dump(fn);
delete[] fn;
char* fn2 = new char[MAX_LINE_LENGTH];
sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
if (validFN != NULL) {
T2TTrainer trainer;
trainer.Init(*cfg);
trainer.Validate(validFN, fn2, model);
validator.Validate(validFN, fn2, model);
}
delete[] fn2;
}
......@@ -405,12 +392,12 @@ update the model by delta rule
\theta_{new} = \theta - \lrate * grad
where
\lrate = d^-0.5 * min(stepNum^{-0.5}, stepNum * warmupStepNum^{-1.5})
>> model - the t2t model
>> model - the model
>> lr - learning rate
*/
void T2TTrainer::Update(T2TModel* model, const float lr)
void Trainer::Update(Model* model, const float lr)
{
TensorList ws(100);
TensorList ws;
model->GetParams(ws);
......@@ -465,12 +452,12 @@ void T2TTrainer::Update(T2TModel* model, const float lr)
prepare model for training
>> model - the model for training
*/
void T2TTrainer::PrepareModel(T2TModel* model)
void Trainer::PrepareModel(Model* model)
{
moments.Clear();
moments2nd.Clear();
TensorList ws(100);
TensorList ws;
model->GetParams(ws);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,25 +18,24 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
*/
#ifndef __T2TTRAINER_H__
#define __T2TTRAINER_H__
#ifndef __TRAINER_H__
#define __TRAINER_H__
#include "../T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../../tensor/function/FHeader.h"
#include "../Model.h"
#include "TrainDataSet.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* trainer of the T2T model */
class T2TTrainer
/* trainer of the model */
class Trainer
{
public:
/* configurations */
T2TConfig* cfg;
Config* cfg;
/* dimension size of each inner layer */
int d;
......@@ -63,12 +61,18 @@ public:
/* word batch size */
int wBatchSize;
/* size of bucket for grouping data by length */
int bucketSize;
/* training epoch number */
int nepoch;
/* traing step number */
int nstep;
/* the maximum number of saved checkpoints */
int maxCheckpoint;
/* indicates whether we use adam */
bool useAdam;
......@@ -100,39 +104,36 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
/* used for loading batches */
TrainDataSet batchLoader;
public:
/* constructor */
T2TTrainer();
Trainer();
/* de-constructor */
~T2TTrainer();
~Trainer();
/* initialize the trainer */
void Init(T2TConfig& config);
void Init(Config& config);
/* train the model */
void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);
/* test the model */
void Validate(const char* fn, const char* ofn, T2TModel* model);
void Validate(const char* fn, const char* ofn, Model* model);
/* make a checkpoint */
void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);
/* update the model by delta rule */
void Update(T2TModel* model, const float lr);
void Update(Model* model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel* model);
void PrepareModel(Model* model);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -26,24 +25,26 @@
#include <fstream>
#include <algorithm>
#include "T2TDataSet.h"
#include "../module/T2TUtility.h"
#include "DataSet.h"
#include "../Utility.h"
using namespace transformer;
using namespace nmt;
namespace nts {
/* sort the output by id (in ascending order) */
void DataSet::SortInput() {
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
return a->values.count > b->values.count;
sort(inputBuffer.items, inputBuffer.items + inputBuffer.count,
[](Example* a, Example* b) {
return a->values.count > b->values.count;
});
}
/* sort the input by length (in descending order) */
void DataSet::SortOutput() {
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
return a->id < b->id;
sort(outputBuffer.items, outputBuffer.items + outputBuffer.count,
[](Result* a, Result* b) {
return a->id < b->id;
});
}
......@@ -74,7 +75,7 @@ void DataSet::LoadDataToBuffer()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(3);
values.Add(UNK);
else
values.Add(srcVocab.word2id.at(word));
}
......@@ -100,7 +101,7 @@ void DataSet::LoadDataToBuffer()
}
/*
load a mini-batch to the device
load a mini-batch to the device (for translating)
>> batchEnc - a tensor to store the batch of input
>> paddingEnc - a tensor to store the batch of paddings
>> minSentBatch - the minimum number of sentence batch
......@@ -117,10 +118,10 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
/* dynamic batching for sentences */
while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
&& (realBatchSize * maxLen < batchSize)) {
realBatchSize++;
}
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
// && (realBatchSize * maxLen < batchSize)) {
// realBatchSize++;
//}
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
......@@ -133,13 +134,13 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
batchValues[i] = 1;
paddingValues[i] = 0.0F;
batchValues[i] = PAD;
paddingValues[i] = 1.0F;
}
size_t cur = 0;
size_t curSrc = 0;
/* left padding */
/* right padding */
UInt64List infos;
size_t totalLength = 0;
......@@ -147,11 +148,11 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
paddingValues[cur++] = 1.0F;
}
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
}
infos.Add(totalLength);
......@@ -178,7 +179,7 @@ the constructor of DataSet
void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
{
fp = new ifstream(dataFile);
CheckNTErrors(fp->is_open(), "can not open the file");
CheckNTErrors(fp->is_open(), "Can not open the test data");
bufferUsed = 0;
CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -26,7 +25,7 @@
#include <cstdio>
#include <vector>
#include <fstream>
#include "T2TVocab.h"
#include "Vocab.h"
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,11 +21,11 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#include "T2TLengthPenalty.h"
#include "LengthPenalty.h"
using namespace nts;
namespace transformer
namespace nmt
{
/*
......@@ -36,7 +35,7 @@ where n = length of the sequence
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence
*/
float T2TLengthPenalizer::GNMT(float length, float alpha)
float LengthPenalizer::GNMT(float length, float alpha)
{
float base;
float lp;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,21 +21,21 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#ifndef __LENGTHPENALTY_H__
#define __LENGTHPENALTY_H__
#include "../module/T2TUtility.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* We intend to penalize short sequences because they have higher score
in product of a sequence of probability-like terms and have more chances
to beat others in search. */
class T2TLengthPenalizer
class LengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -22,23 +21,23 @@
#include <iostream>
#include "T2TPredictor.h"
#include "../module/T2TNNUtil.h"
#include "Predictor.h"
#include "../module/NNUtil.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
T2TStateBundle::T2TStateBundle()
StateBundle::StateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
StateBundle::~StateBundle()
{
if (states != NULL)
delete[] states;
......@@ -48,18 +47,18 @@ T2TStateBundle::~T2TStateBundle()
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
void StateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if (states != NULL)
delete[] states;
states = new T2TState[num];
states = new State[num];
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].pid = _PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
......@@ -74,26 +73,26 @@ void T2TStateBundle::MakeStates(int num)
}
/* constructor */
T2TPredictor::T2TPredictor()
Predictor::Predictor()
{
startSymbol = 2;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
Predictor::~Predictor()
{
}
/*
create an initial state
>> model - the t2t model
>> model - the model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
int beamSize, T2TStateBundle* state)
void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
int beamSize, StateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
......@@ -114,20 +113,20 @@ void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
void Predictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> model - the model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
void Predictor::Read(Model* model, StateBundle* state)
{
m = model;
s = state;
......@@ -147,9 +146,9 @@ predict the next state
>> needReorder - whether we need reordering the states
>> nstep - current time step of the target sequence
*/
void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
XTensor& reorderState, bool needReorder, int nstep)
{
int dims[MAX_TENSOR_DIM_NUM];
......@@ -221,14 +220,14 @@ void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& e
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
XTensor Predictor::GeneratePaths(StateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
......@@ -245,7 +244,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
path.SetZeroAll();
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
int nsteps = 0;
while (cur != NULL) {
......@@ -263,21 +262,21 @@ get the predictions of the previous step
>> state - state bundle of the current step
>> devID - the device id for the predictions
*/
XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
IntList last;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
State* cur = state->states + i;
last.Add(cur->prediction);
}
XTensor lastPred;
InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
lastPred.SetData(last.items, last.Size());
InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
lastPred.SetData(last.items, int(last.Size()));
return lastPred;
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,22 +20,22 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#ifndef __PREDICTOR_H__
#define __PREDICTOR_H__
#include "../T2TModel.h"
#include "T2TLengthPenalty.h"
#include "../Model.h"
#include "LengthPenalty.h"
using namespace std;
namespace transformer
namespace nmt
{
#define T2T_PID_EMPTY -1
#define _PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
and etc. It can be regarded as a hypotheses in translation. */
class T2TState
class State
{
public:
/* we assume that the prediction is an integer */
......@@ -69,11 +68,11 @@ public:
int nstep;
/* pointer to the previous state */
T2TState* last;
State* last;
};
/* a bundle of states */
class T2TStateBundle
class StateBundle
{
public:
/* predictions */
......@@ -98,7 +97,7 @@ public:
float nstep;
/* list of states */
T2TState* states;
State* states;
/* number of states */
int stateNum;
......@@ -108,10 +107,10 @@ public:
public:
/* constructor */
T2TStateBundle();
StateBundle();
/* de-constructor */
~T2TStateBundle();
~StateBundle();
/* create states */
void MakeStates(int num);
......@@ -122,14 +121,14 @@ public:
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */
class T2TPredictor
class Predictor
{
private:
/* pointer to the transformer model */
T2TModel* m;
Model* m;
/* current state */
T2TStateBundle* s;
StateBundle* s;
/* start symbol */
int startSymbol;
......@@ -139,30 +138,30 @@ private:
public:
/* constructor */
T2TPredictor();
Predictor();
/* de-constructor */
~T2TPredictor();
~Predictor();
/* create an initial state */
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel* model, T2TStateBundle* state);
void Read(Model* model, StateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
bool isStart, XTensor& reorderState, bool needReorder, int nstep);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle* state);
XTensor GeneratePaths(StateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state, int devID);
XTensor GetLastPrediction(StateBundle* state, int devID);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,13 +19,13 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "Search.h"
#include "../Utility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
BeamSearch::BeamSearch()
......@@ -55,7 +54,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void BeamSearch::Init(T2TConfig& config)
void BeamSearch::Init(Config& config)
{
beamSize = config.beamSize;
batchSize = config.sBatchSize;
......@@ -105,10 +104,10 @@ search for the most promising states
>> output - output that represents the sequences as rows
>> score - score of the sequences
*/
void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
void BeamSearch::Search(Model* model, XTensor& input, XTensor& padding,
IntList* output, XTensor& score)
{
T2TPredictor predictor;
Predictor predictor;
XTensor maskEnc;
XTensor encoding;
XTensor encodingBeam;
......@@ -140,10 +139,10 @@ void BeamSearch::Search(T2TModel* model, XTensor& input, XTensor& padding,
CheckNTErrors(lengthLimit > 0, "no max length specified!");
maxLength = lengthLimit;
T2TStateBundle* states = new T2TStateBundle[lengthLimit + 1];
T2TStateBundle* first = states;
T2TStateBundle* cur = NULL;
T2TStateBundle* next = NULL;
StateBundle* states = new StateBundle[lengthLimit + 1];
StateBundle* first = states;
StateBundle* cur = NULL;
StateBundle* next = NULL;
/* create the first state */
predictor.Create(model, &encodingBeam, &input, beamSize, first);
......@@ -213,7 +212,7 @@ compute the model score for each hypotheses
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
void BeamSearch::Score(StateBundle* prev, StateBundle* beam)
{
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
......@@ -244,7 +243,7 @@ void BeamSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
beam->nstep = prev->nstep + 1.0F;
/* the GNMT-like length penalty */
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
/* score = log-prob/lp */
score = probPath / lp;
......@@ -279,7 +278,7 @@ generate tokens for the next state via beam pruning
>> prev - the last beam
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
......@@ -323,7 +322,7 @@ void BeamSearch::Generate(T2TStateBundle* prev, T2TStateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = T2TLengthPenalizer::GNMT(beam->nstep, alpha);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -375,26 +374,26 @@ expand the search graph
>> beam - the beam that keeps a number of states
>> reorderState - the new order of states
*/
void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState)
void BeamSearch::Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum,
"A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState* states = beam->states;
State* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
XTensor reorderStateCPU;
InitTensorOnCPU(&id, &idRef);
......@@ -424,7 +423,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState& state = states[k];
State& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
......@@ -432,7 +431,7 @@ void BeamSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reo
if (offset != j)
needReorder = true;
T2TState* last = prev->states + pid * beamSize + offset;
State* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
......@@ -482,12 +481,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void BeamSearch::Collect(T2TStateBundle* beam)
void BeamSearch::Collect(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
State& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
......@@ -508,13 +507,13 @@ void BeamSearch::Collect(T2TStateBundle* beam)
fill the hypothesis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void BeamSearch::FillHeap(T2TStateBundle* beam)
void BeamSearch::FillHeap(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum / beamSize; i++) {
for (int j = 0; j < beamSize; j++) {
T2TState& state = states[i * beamSize + j];
State& state = states[i * beamSize + j];
/* we push the incomplete hypothesis into the heap */
if (fullHypos[state.pid].Count() == 0 && state.isEnd && state.isCompleted) {
......@@ -548,10 +547,10 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
int c = heap.Count();
float bestScore = -1e9F;
T2TState* state = NULL;
State* state = NULL;
for (int i = 0; i < c; i++) {
auto node = heap.Pop();
T2TState* s = (T2TState*)node.index;
State* s = (State*)node.index;
if (i == 0 || bestScore < node.value) {
state = s;
bestScore = node.value;
......@@ -619,12 +618,12 @@ void BeamSearch::SetEnd(const int* tokens, const int tokenNum)
check whether all hypotheses are completed
>> beam - the beam that keeps the searching states
*/
bool BeamSearch::IsAllCompleted(T2TStateBundle* beam)
bool BeamSearch::IsAllCompleted(StateBundle* beam)
{
T2TState* states = beam->states;
State* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState& state = states[i];
State& state = states[i];
if (!state.isCompleted)
return false;
}
......@@ -640,11 +639,11 @@ update the beam by removing finished hypotheses
>> alivePadding - new paddings for the inputs, (B, L)
<< aliveIdx - the indices of alive states
*/
void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
void BeamSearch::RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding,
XTensor& aliveState)
{
T2TState* states = beam->states;
State* states = beam->states;
/* get the indices of uncompleted sentences and states */
aliveSentList.Clear();
......@@ -674,12 +673,12 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
}
}
InitTensor1D(&aliveState, aliveStateList.Size(), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, aliveStateList.Size());
InitTensor1D(&aliveState, int(aliveStateList.Size()), X_INT, aliveEncoding.devID);
aliveState.SetData(aliveStateList.items, int(aliveStateList.Size()));
XTensor aliveSent;
InitTensor1D(&aliveSent, aliveSentList.Size(), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, aliveSentList.Size());
InitTensor1D(&aliveSent, int(aliveSentList.Size()), X_INT, aliveEncoding.devID);
aliveSent.SetData(aliveSentList.items, int(aliveSentList.Size()));
if (aliveStateList.Size() < aliveEncoding.dimSize[0] && aliveStateList.Size() > 0) {
aliveInput = AutoGather(aliveInput, aliveState);
......@@ -697,7 +696,7 @@ void BeamSearch::RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncodi
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor BeamSearch::MakeFirstMask(T2TStateBundle* beam)
XTensor BeamSearch::MakeFirstMask(StateBundle* beam)
{
XTensor& prob = beam->prob;
XTensor mask;
......@@ -742,7 +741,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void GreedySearch::Init(T2TConfig& config)
void GreedySearch::Init(Config& config)
{
batchSize = config.wBatchSize;
endSymbols[0] = config.endID;
......@@ -798,7 +797,7 @@ search for the most promising states
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void GreedySearch::Search(T2TModel* model, XTensor& input,
void GreedySearch::Search(Model* model, XTensor& input,
XTensor& padding, IntList* output)
{
XTensor maskEnc;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,15 +19,15 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#ifndef __SEARCH_H__
#define __SEARCH_H__
#include "../T2TModel.h"
#include "T2TPredictor.h"
#include "../Model.h"
#include "Predictor.h"
using namespace std;
namespace transformer
namespace nmt
{
/* The class organizes the search process. It calls "predictors" to generate
......@@ -42,7 +41,7 @@ private:
float alpha;
/* predictor */
T2TPredictor predictor;
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
......@@ -88,28 +87,28 @@ public:
~BeamSearch();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypotheses */
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
void Score(StateBundle* prev, StateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle* prev, T2TStateBundle* beam);
void Generate(StateBundle* prev, StateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);
void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle* beam);
void Collect(StateBundle* beam);
/* fill the hypotheses heap with incomplete hypotheses */
void FillHeap(T2TStateBundle* beam);
void FillHeap(StateBundle* beam);
/* save the output sequences and score */
void Dump(IntList* output, XTensor* score);
......@@ -118,17 +117,17 @@ public:
bool IsEnd(int token);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle* beam);
bool IsAllCompleted(StateBundle* beam);
/* update the beam by pruning finished states */
void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);
/* set end symbols for search */
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle* beam);
XTensor MakeFirstMask(StateBundle* beam);
};
class GreedySearch
......@@ -136,7 +135,7 @@ class GreedySearch
private:
/* predictor */
T2TPredictor predictor;
Predictor predictor;
/* max length of the generated sequence */
int maxLength;
......@@ -164,10 +163,10 @@ public:
~GreedySearch();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* search for the most promising states */
void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);
void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);
/* preparation */
void Prepare(int myBatchSize);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -20,27 +19,25 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
*/
#include <cmath>
#include "T2TTranslator.h"
#include "T2TSearch.h"
#include "../module/T2TUtility.h"
#include "Search.h"
#include "Translator.h"
#include "../Utility.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XUtility.h"
#include "../../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
namespace nmt
{
/* constructor */
T2TTranslator::T2TTranslator()
Translator::Translator()
{
}
/* de-constructor */
T2TTranslator::~T2TTranslator()
Translator::~Translator()
{
if (beamSize > 1)
delete (BeamSearch*)seacher;
......@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator()
}
/* initialize the model */
void T2TTranslator::Init(T2TConfig& config)
void Translator::Init(Config& config)
{
beamSize = config.beamSize;
vSize = config.srcVocabSize;
......@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config)
wordBatch = config.wBatchSize;
if (beamSize > 1) {
XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize);
LOG("translating with beam search (%d)", beamSize);
seacher = new BeamSearch();
((BeamSearch*)seacher)->Init(config);
}
else if (beamSize == 1) {
XPRINT1(0, stderr, "Translating with greedy search (%d)\n", beamSize);
LOG("translating with greedy search");
seacher = new GreedySearch();
((GreedySearch*)seacher)->Init(config);
}
else {
CheckNTErrors(false, "invalid beam size\n");
CheckNTErrors(false, "Invalid beam size\n");
}
}
......@@ -80,8 +77,8 @@ test the model
>> ofn - output data file
>> model - pretrained model
*/
void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
const char* ofn, T2TModel* model)
void Translator::Translate(const char* ifn, const char* sfn,
const char* tfn, const char* ofn, Model* model)
{
int wc = 0;
int wordCountTotal = 0;
......@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
XTensor paddingEnc;
batchLoader.Init(ifn, sfn, tfn);
XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n",
GetClockSec() - startT);
LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);
int count = 0;
double batchStart = GetClockSec();
......@@ -130,24 +126,24 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
for (int i = 0; i < indices.Size() - 1; ++i) {
Result* res = new Result;
res->id = indices[i];
res->id = int(indices[i]);
res->res = output[i];
batchLoader.outputBuffer.Add(res);
}
delete[] output;
wc += indices[-1];
wordCountTotal += indices[-1];
wc += int(indices[-1]);
wordCountTotal += int(indices[-1]);
sentCount += (indices.Size() - 1);
sentCount += int(indices.Size() - 1);
batchCount += 1;
if (count % 1 == 0) {
double elapsed = GetClockSec() - batchStart;
batchStart = GetClockSec();
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()),
double(wc) / elapsed);
wc = 0;
}
}
......@@ -169,8 +165,8 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
double elapsed = GetClockSec() - startDump;
XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%ld)\n",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
}
/*
......@@ -178,7 +174,7 @@ dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTranslator::Dump(FILE* file, XTensor* output)
void Translator::Dump(FILE* file, XTensor* output)
{
if (output != NULL && output->unitNum != 0) {
int seqLength = output->dimSize[output->order - 1];
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,17 +20,17 @@
* $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#ifndef __TESTER_H__
#define __TESTER_H__
#include "T2TSearch.h"
#include "T2TDataSet.h"
#include "Search.h"
#include "DataSet.h"
namespace transformer
namespace nmt
{
/* This class translates test sentences with a trained model. */
class T2TTranslator
class Translator
{
public:
/* vocabulary size of the source side */
......@@ -57,17 +56,17 @@ public:
public:
/* constructor */
T2TTranslator();
Translator();
/* de-constructor */
~T2TTranslator();
~Translator();
/* initialize the model */
void Init(T2TConfig& config);
void Init(Config& config);
/* test the model */
void Translate(const char* ifn, const char* vfn, const char* ofn,
const char* tfn, T2TModel* model);
const char* tfn, Model* model);
/* dump the result into the file */
void Dump(FILE* file, XTensor* output);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -21,8 +20,8 @@
#include <fstream>
#include "T2TVocab.h"
#include "../module/T2TUtility.h"
#include "Vocab.h"
#include "../Utility.h"
namespace nts {
......@@ -31,7 +30,7 @@ void Vocab::Load(const string& src)
{
string vsz, sid;
ifstream f(src, ios::in);
CheckNTErrors(f.is_open(), "Unable to open the vocabulary file");
CheckNTErrors(f.is_open(), "unable to open the vocabulary file");
/* get the vocab size and the start id */
f >> vsz >> sid;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -19,8 +18,8 @@
* $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
*/
#ifndef __T2TVOCAB_H__
#define __T2TVOCAB_H__
#ifndef __VOCAB_H__
#define __VOCAB_H__
#include <cstdio>
#include <unordered_map>
......@@ -30,10 +29,10 @@ using namespace std;
namespace nts {
/* user-defined symbols */
#define UNK 0
#define PAD 1
#define SOS 2
#define EOS 2
#define UNK 3
/* the vocabulary class */
struct Vocab
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论