Update the Transformer sample based on the NiuTrans.NMT.

00d7b386 · liyinqiao · 3b93be69 · 00d7b386 · 00d7b386 · 3b93be69
Commit 00d7b386 authored Sep 19, 2020 by liyinqiao
--- a/data/transformer/test/bpevocab
+++ b/data/transformer/test/bpevocab
--- a/data/transformer/test/code
+++ b/data/transformer/test/code
--- a/data/transformer/test/fbis.test
+++ b/data/transformer/test/fbis.test
-Munich 18@@ 56 : Four maps that will change your view of the city
-A mental asylum , where today young people are said to meet .
-A cryp@@ t chap@@ el , where they are now dig@@ ging t@@ unn@@ els for the S @@@ -@@ @ Bahn .
-Al@@ lo@@ t@@ ment holders cul@@ tiv@@ ate the soil of former farmers .
-The oldest official map of Munich brings cap@@ tiv@@ ating stories to light .
--- a/data/transformer/test/test.de
+++ b/data/transformer/test/test.de
--- a/data/transformer/test/test.en
+++ b/data/transformer/test/test.en
--- a/data/transformer/train/bpevocab
+++ b/data/transformer/train/bpevocab
--- a/data/transformer/train/code
+++ b/data/transformer/train/code
--- a/data/transformer/train/fbis.train
+++ b/data/transformer/train/fbis.train
--- a/data/transformer/train/train.data.tgz
+++ b/data/transformer/train/train.data.tgz
--- a/data/transformer/train/valid.data.tgz
+++ b/data/transformer/train/valid.data.tgz
--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -26,7 +26,7 @@
 #include "./tensor/core/CHeader.h"
 #include "./tensor/test/Test.h"
 #include "./sample/fnnlm/FNNLM.h"
-#include "./sample/transformer/Transformer.h"
+#include "./sample/transformer/NMT.h"

 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
@@ -34,7 +34,7 @@

 using namespace nts;
 using namespace fnnlm;
-using namespace transformer;
+using namespace nmt;

 int main( int argc, const char ** argv )
 {
@@ -43,7 +43,7 @@ int main( int argc, const char ** argv )
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
-        TransformerMain(argc - 1, argv + 1);
+        NMTMain(argc - 1, argv + 1);
    else{
        fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
        fprintf(stderr, "neural networks in an easy way. \n\n");

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TDecoder.h"
-#include "module/T2TUtility.h"
-#include "module/T2TLayerNormal.h"
-#include "module/T2TCommonModules.h"
+#include "Decoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
 #include "../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -64,7 +61,7 @@ AttDecoder::~AttDecoder()
 initialize the model
 >> config - configurations of the model
 */
-void AttDecoder::InitModel(T2TConfig& config)
+void AttDecoder::InitModel(Config& config)
 {
    devID = config.devID;
    nlayer = config.nDecLayer;
@@ -80,16 +77,17 @@ void AttDecoder::InitModel(T2TConfig& config)
    /* embedding model */
    embedder.InitModel(config, false);

-    selfAtt = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    selfAttLayerNorms = new T2TLN[nlayer];
-    enDeAtt = new T2TAttention[nlayer];
-    enDeAttLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    selfAttLayerNorms = new LN[nlayer];
+    enDeAtt = new Attention[nlayer];
+    enDeAttLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];
+
    selfAttCache = new Cache[nlayer];
    enDeAttCache = new Cache[nlayer];
    if (preNorm)
-        decoderLayerNorm = new T2TLN;
+        decoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -99,6 +97,8 @@ void AttDecoder::InitModel(T2TConfig& config)
        fnnLayerNorms[i].InitModel(config);
        enDeAtt[i].InitModel(config);
        enDeAttLayerNorms[i].InitModel(config);
+        selfAttCache[i].enable = true;
+        enDeAttCache[i].enable = true;
    }
    if (preNorm)
        decoderLayerNorm->InitModel(config);
@@ -115,9 +115,10 @@ make the decoding network
 << return - the output tensor of the decoder
 */
 XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
-    XTensor* maskEncDec, int nstep, bool isTraining)
+                         XTensor* maskEncDec, int nstep, bool isTraining)
 {
    XTensor x;
+
    x = embedder.Make(inputDec, true, isTraining, nstep);

    /* dropout */
@@ -188,8 +189,86 @@ XTensor AttDecoder::Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
    }

    if (preNorm)
-        x = decoderLayerNorm->Make(x);
+        return decoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the decoding network (fast path, pre-norm layer ordering only)
+>> inputDec - the input tensor of the decoder
+>> outputEnc - the output tensor of the encoder
+>> mask - mask that indicates which position is valid
+>> maskEncDec - mask for the encoder-decoder attention
+>> nstep - the current length of the decoder input
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the decoder (after the final layer norm)
+*/
+XTensor AttDecoder::MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                             XTensor* maskEncDec, int nstep, bool isTraining)
+{
+    /* the final layer norm is only allocated for pre-norm models (see InitModel) */
+    CheckNTErrors(preNorm, "MakeFast requires a pre-norm model!");
+
+    XTensor x;
+    x = embedder.Make(inputDec, true, isTraining, nstep);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = selfAttLayerNorms[i].Make(x);
+
+        /* self attention */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, &selfAttCache[i], SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for encoder-decoder attention */
+        x = enDeAttLayerNorms[i].Make(x);
+
+        /* encoder-decoder attention */
+        x = enDeAtt[i].Make(outputEnc, x, outputEnc, maskEncDec,
+                            isTraining, &enDeAttCache[i], EN_DE_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+
+    x = decoderLayerNorm->Make(x);

    return x;
 }
+
 }
\ No newline at end of file
--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TDECODER_H__
-#define __T2TDECODER_H__
+#ifndef __DECODER_H__
+#define __DECODER_H__

-#include "T2TEncoder.h"
-#include "module/T2TUtility.h"
+#include "Encoder.h"
+#include "Utility.h"

-namespace transformer
+namespace nmt
 {

 class AttDecoder
@@ -52,28 +51,28 @@ public:
    DTYPE dropoutP;

    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN* fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention* selfAtt;
+    Attention* selfAtt;

    /* layer normalization for attention */
-    T2TLN* selfAttLayerNorms;
+    LN* selfAttLayerNorms;

    /* layer normalization for fnn */
-    T2TLN* fnnLayerNorms;
+    LN* fnnLayerNorms;

    /* layer normalization for decoder */
-    T2TLN* decoderLayerNorm;
+    LN* decoderLayerNorm;

    /* encoder-decoder attention model of each layer */
-    T2TAttention* enDeAtt;
+    Attention* enDeAtt;

    /* layer normalization for encoder-decoder attention */
-    T2TLN* enDeAttLayerNorms;
+    LN* enDeAttLayerNorms;

    /* layer cache list */
    Cache* selfAttCache;
@@ -92,11 +91,15 @@ public:
    ~AttDecoder();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the decoding network */
    XTensor Make(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
                 XTensor* maskEncDec, int nstep, bool isTraining);
+
+    /* make the decoding network (pre norm) */
+    XTensor MakeFast(XTensor& inputDec, XTensor& outputEnc, XTensor* mask,
+                     XTensor* maskEncDec, int nstep, bool isTraining);
 };

 }

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,13 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TEncoder.h"
-#include "module/T2TUtility.h"
-#include "module/T2TLayerNormal.h"
-#include "module/T2TCommonModules.h"
+#include "Encoder.h"
+#include "Utility.h"
+#include "module/LayerNorm.h"
+#include "module/CommonModules.h"
 #include "../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -56,7 +53,7 @@ AttEncoder::~AttEncoder()
 initialize the model
 >> config - configurations for the model
 */
-void AttEncoder::InitModel(T2TConfig& config)
+void AttEncoder::InitModel(Config& config)
 {

    devID = config.devID;
@@ -68,18 +65,18 @@ void AttEncoder::InitModel(T2TConfig& config)
    dropoutP = config.dropout;

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(vSize > 1, "Set vocabulary size by \"-vsize\"");

    /* embedding model */
    embedder.InitModel(config);

-    selfAtt = new T2TAttention[nlayer];
-    fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
-    fnnLayerNorms = new T2TLN[nlayer];
+    selfAtt = new Attention[nlayer];
+    fnns = new FNN[nlayer];
+    attLayerNorms = new LN[nlayer];
+    fnnLayerNorms = new LN[nlayer];

    if (preNorm)
-        encoderLayerNorm = new T2TLN;
+        encoderLayerNorm = new LN;

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
@@ -122,7 +119,7 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        attnBefore = LayerNorm(x, attLayerNorms[i], preNorm, true, false);

        /* self attention */
-        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, 0);
+        att = selfAtt[i].Make(attnBefore, attnBefore, attnBefore, mask, isTraining, NULL, SELF_ATT);

        /* dropout */
        if (isTraining && dropoutP > 0)
@@ -151,7 +148,63 @@ XTensor AttEncoder::Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, boo
        x = LayerNorm(res, fnnLayerNorms[i], preNorm, false, true);
    }
    if (preNorm)
-        x = encoderLayerNorm->Make(x);
+        return encoderLayerNorm->Make(x);
+
+    return x;
+}
+
+/*
+make the encoding network (fast path; requires preNorm, see InitModel)
+>> input - the input tensor of the encoder
+>> mask - the mask that indicates which position is valid
+>> maskEncDec - not used by this function
+>> isTraining - indicates whether the model is used for training
+<< return - the output tensor of the encoder (after the final layer norm)
+*/
+XTensor AttEncoder::MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining)
+{
+    CheckNTErrors(preNorm, "MakeFast requires a pre-norm model!");
+
+    XTensor x;
+    x = embedder.Make(input, false, isTraining);
+
+    /* dropout */
+    if (isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
+    for (int i = 0; i < nlayer; i++) {
+        XTensor res;
+        res = x;
+
+        /* layer normalization with pre-norm for self-attn */
+        x = attLayerNorms[i].Make(x);
+
+        /* self attention */
+        x = selfAtt[i].Make(x, x, x, mask, isTraining, NULL, SELF_ATT);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+
+        res = x;
+
+        /* layer normalization with pre-norm for fnn */
+        x = fnnLayerNorms[i].Make(x);
+
+        /* fnn */
+        x = fnns[i].Make(x, isTraining);
+
+        /* dropout */
+        if (isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
+
+        /* residual connection */
+        x = Sum(res, x);
+    }
+    x = encoderLayerNorm->Make(x);

    return x;
 }

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,25 +19,25 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TENCODER_H__
-#define __T2TENCODER_H__
+#ifndef __ENCODER_H__
+#define __ENCODER_H__

-#include "module/T2TFNN.h"
-#include "module/T2TUtility.h"
-#include "module/T2TAttention.h"
-#include "module/T2TEmbedding.h"
-#include "module/T2TLayerNormal.h"
+#include "Utility.h"
+#include "module/FNN.h"
+#include "module/Attention.h"
+#include "module/Embedding.h"
+#include "module/LayerNorm.h"
 #include "../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /*
 base class of the encoder
 */
-class T2TEncoder
+class Encoder
 {
 public:
    virtual XTensor Make(XTensor& input, XTensor* mask, XTensor& mask2, bool isTraining) = 0;
@@ -47,7 +46,7 @@ public:
 /*
 the encoder based on self-attention
 */
-class AttEncoder : T2TEncoder
+class AttEncoder : Encoder
 {
 public:
    /* device id */
@@ -73,22 +72,22 @@ public:
    int ignored;

    /* embedding of word at each position */
-    T2TEmbedder embedder;
+    Embedder embedder;

    /* FNN model of each layer */
-    T2TFNN* fnns;
+    FNN* fnns;

    /* attention model of each layer */
-    T2TAttention* selfAtt;
+    Attention* selfAtt;

    /* layer normalizations for attention */
-    T2TLN* attLayerNorms;
+    LN* attLayerNorms;

    /* layer normalization for fnn */
-    T2TLN* fnnLayerNorms;
+    LN* fnnLayerNorms;

    /* layer normalization for encoder */
-    T2TLN* encoderLayerNorm;
+    LN* encoderLayerNorm;

    /* the location of layer normalization */
    bool preNorm;
@@ -101,11 +100,14 @@ public:
    ~AttEncoder();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the encoding network */
    XTensor Make(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);

+    /* make the encoding network (pre norm) */
+    XTensor MakeFast(XTensor& input, XTensor* mask, XTensor& maskEncDec, bool isTraining);
+
    /* make the encoding network (wrapper) */
    XTensor Make(XTensor& input, XTensor* mask, bool isTraining);
 };

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,23 +19,22 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TMODEL_H__
-#define __T2TMODEL_H__
+#ifndef __MODEL_H__
+#define __MODEL_H__

-#include "T2TEncoder.h"
-#include "T2TDecoder.h"
-#include "module/T2TFNN.h"
-#include "module/T2TOutput.h"
-#include "module/T2TUtility.h"
-#include "module/T2TAttention.h"
+#include "Encoder.h"
+#include "Decoder.h"
+#include "module/FNN.h"
+#include "module/Output.h"
+#include "Utility.h"
+#include "module/Attention.h"

-namespace transformer
+namespace nmt
 {

-/* a transformer model that keeps parameters of the encoder,
-   the decoder and the output layer (softmax). Also, it creates
-   the network used in transformer. */
-class T2TModel
+/* an NMT model that keeps parameters of the encoder,
+   the decoder and the output layer (softmax). */
+class Model
 {
 public:
    /* device id */
@@ -49,7 +47,7 @@ public:
    AttDecoder* decoder;

    /* output layer */
-    T2TOutput* outputLayer;
+    Output* outputLayer;

    /* indicates whether the model is running for language modeling */
    bool isLM;
@@ -71,13 +69,16 @@ public:

 public:
    /* constructor */
-    T2TModel();
+    Model();

    /* de-constructor */
-    ~T2TModel();
+    ~Model();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);
+
+    /* print model configurations */
+    void ShowModelConfig(Config& config);

    /* make the encoding network */
    XTensor MakeEncoder(XTensor& input, XTensor* mask, bool isTraining);

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,49 +16,47 @@

 /*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06, 2020-07
 */

-#include <cmath>
 #include <ctime>

-#include "Transformer.h"
-#include "train/T2TTrainer.h"
-#include "module/T2TUtility.h"
-#include "translate/T2TTranslator.h"
-#include "../../tensor/XDevice.h"
-#include "../../tensor/XGlobal.h"
-#include "../../tensor/XUtility.h"
+#include "NMT.h"
+#include "train/Trainer.h"
+#include "translate/Translator.h"

-namespace transformer
+namespace nmt
 {

-int TransformerMain(int argc, const char** argv)
+int NMTMain(int argc, const char** argv)
 {
    if (argc == 0)
        return 1;

    /* load configurations */
-    T2TConfig config(argc, argv);
+    Config config(argc, argv);

-    srand((unsigned int)time(NULL));
+    srand(1);

-    /* train the model */
+    /* training */
    if (strcmp(config.trainFN, "") != 0) {
-        ENABLE_GRAD;
-        T2TModel model;
+        
+        Model model;
        model.InitModel(config);
-        T2TTrainer trainer;
+        Trainer trainer;
        trainer.Init(config);
        trainer.Train(config.trainFN, config.validFN, config.modelFN, &model);
    }

-    /* translate the test file */
+    /* translating */
    if (strcmp(config.testFN, "") != 0 && strcmp(config.outputFN, "") != 0) {
+        
+        /* disable grad flow */
        DISABLE_GRAD;
-        T2TModel model;
+
+        Model model;
        model.InitModel(config);
-        T2TTranslator translator;
+        Translator translator;
        translator.Init(config);
        translator.Translate(config.testFN, config.srcVocabFN, 
                             config.tgtVocabFN, config.outputFN, &model);

--- a/source/sample/transformer/Transformer.h
+++ b/source/sample/transformer/Transformer.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,29 +15,17 @@
 */

 /*
- *
- * An implementation of the transformer system. See more details
- * about FNNLM in
- * "Attention Is All You Need" by Vaswani et al.
- * https://arxiv.org/pdf/1706.03762.pdf
- *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
- * I start writing the code related to NMT - a long time since my last coding
- * work on MT
+ * An implementation of the NMT system. 
 */

-#ifndef __TRANSFORMER_H__
-#define __TRANSFORMER_H__
-
-#include "../../tensor/XGlobal.h"
-#include "../../tensor/XTensor.h"
-#include "../../tensor/core/CHeader.h"
+#ifndef __NMT_H__
+#define __NMT_H__

-namespace transformer
+namespace nmt
 {

 /* entrance of the program */
-int TransformerMain(int argc, const char** argv);
+int NMTMain(int argc, const char** argv);

 }


--- a/source/sample/transformer/module/T2TUtility.cpp
+++ b/source/sample/transformer/module/T2TUtility.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,13 +26,13 @@
 #include <fstream>
 #include <sstream>

-#include "T2TUtility.h"
-#include "../../../tensor/XGlobal.h"
+#include "Utility.h"
+#include "../../tensor/XGlobal.h"

 using namespace nts;
 using namespace std;

-namespace transformer
+namespace nmt
 {

 /*
@@ -41,7 +40,7 @@ load configurations from the command
 >> argc - number of arguments
 >> argv - the list of arguments
 */
-T2TConfig::T2TConfig(int argc, const char** argv)
+Config::Config(int argc, const char** argv)
 {
    char** args = new char* [MAX_PARAM_NUM];
    for (int i = 0; i < argc; i++) {
@@ -61,22 +60,26 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    ShowParams(argsNum, args);

    /* options for the model */
-    LoadParamInt(argsNum, args, "nhead", &nhead, 8);
-    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 1);
-    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 1);
+    LoadParamInt(argsNum, args, "nhead", &nhead, 4);
+    LoadParamInt(argsNum, args, "enclayer", &nEncLayer, 6);
+    LoadParamInt(argsNum, args, "declayer", &nDecLayer, 6);
    LoadParamInt(argsNum, args, "maxrp", &maxRP, 8);
-    LoadParamInt(argsNum, args, "embsize", &embSize, 256);
-    LoadParamInt(argsNum, args, "modelsize", &modelSize, 256);
+    LoadParamInt(argsNum, args, "embsize", &embSize, 512);
+    LoadParamInt(argsNum, args, "modelsize", &modelSize, 512);
    LoadParamInt(argsNum, args, "maxpos", &maxPosLen, 1024);
-    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 4);
-    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10000);
-    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10000);
+    LoadParamInt(argsNum, args, "fnnhidden", &fnnHiddenSize, modelSize * 2);
+    LoadParamInt(argsNum, args, "vsize", &srcVocabSize, 10152);
+    LoadParamInt(argsNum, args, "vsizetgt", &tgtVocabSize, 10152);
    LoadParamInt(argsNum, args, "padid", &padID, 1);
    LoadParamInt(argsNum, args, "startid", &startID, 2);
    LoadParamInt(argsNum, args, "endid", &endID, 2);
    LoadParamBool(argsNum, args, "rpr", &useRPR, false);
-    LoadParamBool(argsNum, args, "prenorm", &preNorm, false);
-    LoadParamString(argsNum, args, "model", modelFN, "model.bin");
+    LoadParamBool(argsNum, args, "prenorm", &preNorm, true);
+
+    // TODO: refactor the parameters type to support weight sharing during training
+    LoadParamInt(argsNum, args, "shareemb", &shareAllEmbeddings, 0);
+    LoadParamInt(argsNum, args, "sharedec", &shareDecInputOutputWeight, 0);
+    LoadParamString(argsNum, args, "model", modelFN, "");
    LoadParamString(argsNum, args, "srcvocab", srcVocabFN, "vocab.src");
    LoadParamString(argsNum, args, "tgtvocab", tgtVocabFN, "vocab.tgt");

@@ -84,19 +87,20 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamString(argsNum, args, "train", trainFN, "");
    LoadParamString(argsNum, args, "valid", validFN, "");
    LoadParamInt(argsNum, args, "dev", &devID, 0);
-    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 2048);
-    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 1);
+    LoadParamInt(argsNum, args, "wbatch", &wBatchSize, 4096);
+    LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
    isTraining = (strcmp(trainFN, "") == 0) ? false : true;
    LoadParamBool(argsNum, args, "mt", &isMT, true);
-    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.1);
-    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.0);
-    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.0);
+    LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
+    LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
+    LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);

-    LoadParamFloat(argc, args, "lrate", &lrate, 1.0F);
+    LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
    LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
-    LoadParamInt(argc, args, "nepoch", &nepoch, 20);
+    LoadParamInt(argc, args, "nepoch", &nepoch, 50);
+    LoadParamInt(argc, args, "maxcheckpoint", &maxCheckpoint, 10);
    LoadParamInt(argc, args, "nstep", &nstep, 100000);
-    LoadParamInt(argc, args, "nwarmup", &nwarmup, 3000);
+    LoadParamInt(argc, args, "nwarmup", &nwarmup, 8000);
    LoadParamBool(argc, args, "adam", &useAdam, true);
    LoadParamFloat(argc, args, "adambeta1", &adamBeta1, 0.9F);
    LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
@@ -104,9 +108,8 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamBool(argc, args, "shuffled", &isShuffled, true);
    LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
    LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
-    LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, false);
+    LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
    LoadParamInt(argc, args, "updatestep", &updateStep, 1);
-    LoadParamBool(argc, args, "debug", &isDebugged, false);
    LoadParamBool(argc, args, "sorted", &isLenSorted, false);

    LoadParamInt(argc, args, "bufsize", &bufSize, 50000);
@@ -114,7 +117,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamBool(argc, args, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, args, "bigbatch", &isBigBatch, false);
    LoadParamBool(argc, args, "randbatch", &isRandomBatch, false);
-    LoadParamInt(argc, args, "bucketsize", &bucketSize, 0);
+    LoadParamInt(argc, args, "bucketsize", &bucketSize, wBatchSize * 10);

    /* options for translating */
    LoadParamString(argsNum, args, "test", testFN, "");
@@ -122,7 +125,7 @@ T2TConfig::T2TConfig(int argc, const char** argv)
    LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
    LoadParamBool(argsNum, args, "fp16", &useFP16, false);
    LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
-    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 2.0);
+    LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);

    for (int i = 0; i < argc; i++)
        delete[] args[i];
@@ -136,7 +139,7 @@ load configurations from a file
 >> args - the list to store the configurations
 format: one option per line, separated by a blank or a tab
 */
-int T2TConfig::LoadFromFile(const char* configFN, char** args) {
+int Config::LoadFromFile(const char* configFN, char** args) {
    ifstream f(configFN, ios::in);
    CheckNTErrors(f.is_open(), "unable to open the config file");


--- a/source/sample/transformer/module/T2TUtility.h
+++ b/source/sample/transformer/module/T2TUtility.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,18 +19,18 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
 */

-#ifndef __T2TUTILITY_H__
-#define __T2TUTILITY_H__
+#ifndef __UTILITY_H__
+#define __UTILITY_H__

 #include <string>
 #include <cstdio>

-#include "../../../tensor/XList.h"
+#include "../../tensor/XList.h"

 using namespace std;
 using namespace nts;

-namespace transformer
+namespace nmt
 {

 #define MAX_PARAM_NUM 100
@@ -50,8 +49,8 @@ IntList SplitInt(const string& s, const string& delimiter);
 FloatList SplitFloat(const string& s, const string& delimiter);
 UInt64List SplitToPos(const string& s, const string& delimiter);

-/* configurations for t2t */
-class T2TConfig {
+/* configurations for the NMT system */
+class Config {
 public:
    /* path to the model */
    char modelFN[1024];
@@ -131,6 +130,12 @@ public:
    /* indicates whether the model is running for machine translation */
    bool isMT;

+    /* indicates whether share encoder decoder embeddings */
+    int shareAllEmbeddings;
+
+    /* indicates whether share decoder embeddings and output weights */
+    int shareDecInputOutputWeight;
+
    /* indicates whether the model is running with FP16 data type */
    bool useFP16;

@@ -164,9 +169,12 @@ public:
    /* training epoch number */
    int nepoch;

-    /* traing step number */
+    /* training step number */
    int nstep;

+    /* the maximum number of saved checkpoints */
+    int maxCheckpoint;
+
    /* indicates whether we use Adam */
    bool useAdam;

@@ -193,9 +201,6 @@ public:
    /* number of batches on which we do model update */
    int updateStep;

-    /* indicates whether we intend to debug the net */
-    bool isDebugged;
-
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

@@ -222,7 +227,7 @@ public:
 public:

    /* load configurations from the command */
-    T2TConfig(int argc, const char** argv);
+    Config(int argc, const char** argv);

    /* load configurations from a file */
    int LoadFromFile(const char* configFN, char** args);

--- a/source/sample/transformer/module/T2TAttention.cpp
+++ b/source/sample/transformer/module/T2TAttention.cpp
--- a/source/sample/transformer/module/T2TAttention.h
+++ b/source/sample/transformer/module/T2TAttention.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,17 +19,17 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
 */

-#ifndef __T2TATTENTION_H__
-#define __T2TATTENTION_H__
+#ifndef __ATTENTION_H__
+#define __ATTENTION_H__

-#include "T2TNNUtil.h"
-#include "T2TUtility.h"
+#include "NNUtil.h"
+#include "../Utility.h"
 #include "../../../network/XNet.h"
 #include "../../../tensor/core/CHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {
 /* attention type */
 enum { NONE, SELF_ATT, EN_DE_ATT };
@@ -50,6 +49,9 @@ public:
    /* indicates cache miss if 'true' */
    bool miss;

+    /* indicates whether we use cache */
+    bool enable;
+
    /* constructor */
    Cache();

@@ -64,7 +66,7 @@ public:
 };

 /* multi-head attention */
-class T2TAttention
+class Attention
 {
 public:
    /* device id */
@@ -74,22 +76,22 @@ public:
    int nhead;

    /* transformation matrix for Q */
-    XTensor wq;
+    XTensor weightQ;

    /* bias for Q */
-    XTensor bq;
+    XTensor biasQ;

    /* transformation matrix for K */
-    XTensor wk;
+    XTensor weightK;

    /* bias for K */
-    XTensor bk;
+    XTensor biasK;

    /* transformation matrix for V */
-    XTensor wv;
+    XTensor weightV;

    /* bias for V */
-    XTensor bv;
+    XTensor biasV;

    XTensor wBig;

@@ -99,10 +101,10 @@ public:
    XTensor RPEmbK;

    /* transformation after dot-product attention */
-    XTensor wo;
+    XTensor weightO;

    /* bias after dot-product attention */
-    XTensor bo;
+    XTensor biasO;

    /* size of transformed Q and K */
    int dk;
@@ -124,13 +126,13 @@ public:

 public:
    /* constructor */
-    T2TAttention();
+    Attention();

    /* de-constructor */
-    ~T2TAttention();
+    ~Attention();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& k, XTensor& q, XTensor& v,
@@ -145,8 +147,10 @@ public:
    XTensor MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
                             XTensor* mask, bool isTraining, bool isEnc);

+    /* generate relative position embeddings */
    XTensor GetRPEmbedding(const int lenQ, const int lenKV, const int maxRelativeLen, const bool isEnc);

+    /* relative position-aware dot-product attention inner calculation */
    XTensor RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool is_key);
 };
 }

--- a/source/sample/transformer/module/T2TCommonModules.cpp
+++ b/source/sample/transformer/module/T2TCommonModules.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,13 +19,11 @@
 * This file includes some common modules of the Transformer model
 */

-#include <cmath>
-
-#include "T2TCommonModules.h"
+#include "CommonModules.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* 
@@ -37,7 +34,7 @@ flexible layer normalization for the Transformer
 >> before - whether we use layernorm before attention/fnn
 >> after - whether we use layernorm after attention/fnn
 */
-XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after)
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after)
 {
    if (after ^ prenorm)
        return ln.Make(input);

--- a/source/sample/transformer/module/T2TCommonModules.h
+++ b/source/sample/transformer/module/T2TCommonModules.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northestern University. 
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,16 +21,16 @@
 #ifndef __COMMONMODULE_H__
 #define __COMMONMODULE_H__

-#include "T2TLayerNormal.h"
-#include "T2TCommonModules.h"
+#include "LayerNorm.h"
+#include "CommonModules.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* the layer normalization module to control pre-norm or post-norm*/
-XTensor LayerNorm(XTensor& input, T2TLN& ln, bool prenorm, bool before, bool after);
+XTensor LayerNorm(XTensor& input, LN& ln, bool prenorm, bool before, bool after);

 }


--- a/source/sample/transformer/module/T2TEmbedding.cpp
+++ b/source/sample/transformer/module/T2TEmbedding.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,17 +19,15 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
 */

-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TEmbedder::T2TEmbedder()
+Embedder::Embedder()
 {
    devID = -1;
    vSize = -1;
@@ -38,7 +35,7 @@ T2TEmbedder::T2TEmbedder()
 }

 /* de-constructor */
-T2TEmbedder::~T2TEmbedder()
+Embedder::~Embedder()
 {
 }

@@ -47,7 +44,7 @@ initialize the model
 >> config - configurations of the model
 >> isEnc - indicates if it is used for the encoder
 */
-void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
+void Embedder::InitModel(Config& config, bool isEnc)
 {
    devID = config.devID;
    d = config.modelSize;
@@ -70,7 +67,7 @@ void T2TEmbedder::InitModel(T2TConfig& config, bool isEnc)
 make positional embeddings (of size eSize * length)
 >> length - length of the sequence
 */
-void T2TEmbedder::MakePosEmbedding(int length)
+void Embedder::MakePosEmbedding(int length)
 {
    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

@@ -110,58 +107,45 @@ make the network
 >> isTraining - indicates whether it is training
 << return - word & position embeddings of the input
 */
-XTensor T2TEmbedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
+XTensor Embedder::Make(XTensor& input, bool isDec, bool isTraining, int nstep)
 {
    /* make sure the padding index is 1 */
    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
-    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
-    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
+    CheckNTErrors(vSize > 0, "Set vocabulary size by \"-vsize\"");
+    CheckNTErrors(eSize > 0, "Set embedding size by \"-esize\"");

    XTensor wordEmbedding, position, posEmbedding;
-    InitTensor(&position, &input);
-
-    int* posData = new int[input.unitNum];

-    XTensor inputCPU;
-    InitTensorOnCPU(&inputCPU, &input);
-    _CopyValues(&input, &inputCPU);
+    InitTensor1D(&position, input.GetDim(-1), X_INT, devID);

-    if (!isDec)
+    if (!isDec || isTraining || input.GetDim(-1) > 1)
    {
-        /* encoder embeddings */
-        for (int i = 0; i < inputCPU.dimSize[0]; i++) {
-            int startNoPad = 1 + 1;
-            int* p = ((int*)inputCPU.data) + i * inputCPU.dimSize[1];
-            for (int j = 0; j < inputCPU.dimSize[1]; j++) {
-                if (p[j] == 1) {
-                    posData[i * inputCPU.dimSize[1] + j] = 1;
-                }
-                else {
-                    posData[i * inputCPU.dimSize[1] + j] = startNoPad++;
-                }
-            }
-        }
-        position.SetData(posData, position.unitNum);
+        position.Range(0, position.unitNum, 1);
+
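+        /* offset the positions by (padIdx + 1) in place; NOTE(review): the original note said "disable grad" -- presumably no gradient is wanted for the integer position indices, confirm */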
+        ScaleAndShiftMe(position, 1.0F, float(padIdx + 1));
    }
    else
    {
-        /* decoder embeddings */
-        position.SetDataFixed(nstep + 2);
+        /* decoder embeddings during decoding */
+        position.SetDataFixed(nstep + padIdx + 1);
    }

-    delete[] posData;
-
    /* we make positional embeddings first */
-    posEmbedding = Gather(posEmbeddingBase, position);
+    XTensor embTMP;
+    embTMP = Gather(posEmbeddingBase, position);
+    posEmbedding = Unsqueeze(embTMP, 0, input.GetDim(0));

    /* then we make word embeddings */
+    //w.enableGrad = false;
    wordEmbedding = Gather(w, input);

    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

    /* we sum over the two embeddings */
-    return wordEmbedding + posEmbedding;
+    SumMe(wordEmbedding, posEmbedding);
+    return wordEmbedding;
 }

 }
\ No newline at end of file
--- a/source/sample/transformer/module/T2TEmbedding.h
+++ b/source/sample/transformer/module/T2TEmbedding.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,15 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-07
 */

-#ifndef __T2TEMBEDDING_H__
-#define __T2TEMBEDDING_H__
+#ifndef __EMBEDDING_H__
+#define __EMBEDDING_H__

-#include "T2TUtility.h"
+#include "../Utility.h"
 #include "../../../network/XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 #define DEFAULT_EMBEDDING_SIZE 512
@@ -37,7 +36,7 @@ namespace transformer
 embedding (of word at position i):
 word embedding + positional embedding
 */
-class T2TEmbedder
+class Embedder
 {
 public:
    /* device id */
@@ -52,7 +51,7 @@ public:
    /* maximum length of the sequence */
    int maxLength;

-    /* dimension size of the hidden layers in the t2t model */
+    /* dimension size of the hidden layers in the model */
    int d;

    /* padding index */
@@ -67,13 +66,13 @@ public:

 public:
    /* constructor */
-    T2TEmbedder();
+    Embedder();

    /* de-constructor */
-    ~T2TEmbedder();
+    ~Embedder();

    /* initialize the model */
-    void InitModel(T2TConfig& config, bool isEnc = true);
+    void InitModel(Config& config, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int length);

--- a/source/sample/transformer/module/T2TFNN.cpp
+++ b/source/sample/transformer/module/T2TFNN.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,19 +19,17 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TFNN.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "FNN.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TFNN::T2TFNN()
+FNN::FNN()
 {
    inSize = -1;
    outSize = -1;
@@ -40,7 +37,7 @@ T2TFNN::T2TFNN()
 }

 /* de-constructor */
-T2TFNN::~T2TFNN()
+FNN::~FNN()
 {
 }

@@ -50,7 +47,7 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> config - configurations of the model
 */
-void T2TFNN::InitModel(T2TConfig& config)
+void FNN::InitModel(Config& config)
 {
    devID = config.devID;

@@ -69,6 +66,9 @@ void T2TFNN::InitModel(T2TConfig& config)
    _SetDataFanInOut(&w1, scale);
    _SetDataFanInOut(&w2, scale);

+    w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
+    w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
+
    b1.SetZeroAll();
    b2.SetZeroAll();
 }
@@ -79,7 +79,7 @@ y = max(0, x * w1 + b1) * w2 + b2
 >> input - the input tensor
 >> return - the output tensor
 */
-XTensor T2TFNN::Make(XTensor& input, bool isTraining)
+XTensor FNN::Make(XTensor& input, bool isTraining)
 {
    XTensor t1;


--- a/source/sample/transformer/module/T2TFNN.h
+++ b/source/sample/transformer/module/T2TFNN.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,20 +19,20 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TFNN_H__
-#define __T2TFNN_H__
+#ifndef __FNN_H__
+#define __FNN_H__

-#include "T2TUtility.h"
-#include "T2TLayerNormal.h"
+#include "LayerNorm.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
-class T2TFNN
+class FNN
 {
 public:
    /* device id */
@@ -66,13 +65,13 @@ public:
 public:

    /* constructor */
-    T2TFNN();
+    FNN();

    /* de-constructor */
-    ~T2TFNN();
+    ~FNN();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input, bool isTraining);

--- a/source/sample/transformer/module/T2TGatedLinearUnit.cpp
+++ b/source/sample/transformer/module/T2TGatedLinearUnit.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,16 +18,13 @@
 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
 */

-
-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TGatedLinearUnit.h"
+#include "GLU.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"
 #include "../../../tensor/function/FHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -48,7 +44,7 @@ GLU::~GLU()
 initialize the model
 >> config - configurations of the model
 */
-void GLU::InitModel(T2TConfig& config)
+void GLU::InitModel(Config& config)
 {
    devID = config.devID;


--- a/source/sample/transformer/module/T2TGatedLinearUnit.h
+++ b/source/sample/transformer/module/T2TGatedLinearUnit.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,12 +22,11 @@
 #ifndef __GLU_H__
 #define __GLU_H__

-#include "T2TLayerNormal.h"
-#include "T2TGatedLinearUnit.h"
+#include "LayerNorm.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* a fnn: y = max(0, x * w1 + b1) * w2 + b2 */
@@ -68,7 +66,7 @@ public:
    ~GLU();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input);

--- a/source/sample/transformer/module/T2TLayerHistory.cpp
+++ b/source/sample/transformer/module/T2TLayerHistory.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,19 +18,16 @@
 * $Created by: Bei Li (libei_neu@outlook.com) 2020-02-03
 */

-#include <cmath>
-
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TLayerNormal.h"
-#include "T2TLayerHistory.h"
-
+#include "Embedding.h"
+#include "LayerNorm.h"
+#include "LayerHistory.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

 #define SAFE_DELETE(x) do{ if((x) != NULL){delete (x); (x) = NULL;} } while(false)
 #define SAFE_DELETE_ARRAY(x) do{ if((x) != NULL) {delete [] (x); (x)=NULL;} } while(false)

-namespace transformer
+namespace nmt
 {

 /* constructor */
@@ -54,7 +50,7 @@ LayerHistory::~LayerHistory()
 initialize the model
 >> config - configurations of the model
 */
-void LayerHistory::InitModel(T2TConfig& config)
+void LayerHistory::InitModel(Config& config)
 {
    devID = config.devID;
    d = config.modelSize;
@@ -62,7 +58,7 @@ void LayerHistory::InitModel(T2TConfig& config)

    InitTensor2D(&weight, nlayer + 1, nlayer + 1, X_FLOAT, devID);

-    layerNorms = new T2TLN[nlayer];
+    layerNorms = new LN[nlayer];

    /* initialize the layer normalization of each layer */
    for (int i = 0; i < nlayer; i++) {

--- a/source/sample/transformer/module/T2TLayerHistory.h
+++ b/source/sample/transformer/module/T2TLayerHistory.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,14 +21,14 @@
 #ifndef __LAYERHISTORY_H__
 #define __LAYERHISTORY_H__

-#include "T2TLayerNormal.h"
-#include "T2TLayerHistory.h"
+#include "LayerNorm.h"
+#include "LayerHistory.h"

 #include "../../../tensor/function/FHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /*
@@ -61,7 +60,7 @@ public:
    TensorList history;

    /* layer normalization for each intimidate layer */
-    T2TLN* layerNorms;
+    LN* layerNorms;

 public:
    /* constructor */
@@ -71,7 +70,7 @@ public:
    ~LayerHistory();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* add the layer output to the history */
    void Add(XTensor& tensor);

--- a/source/sample/transformer/module/T2TLayerNormal.cpp
+++ b/source/sample/transformer/module/T2TLayerNormal.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,24 +19,23 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
-#include "T2TLayerNormal.h"
+#include "Embedding.h"
+#include "LayerNorm.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TLN::T2TLN()
+LN::LN()
 {
    devID = -1;
    d = 0;
 }

 /* de-constructor */
-T2TLN::~T2TLN()
+LN::~LN()
 {
 }

@@ -47,7 +45,7 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> config - configurations of the model
 */
-void T2TLN::InitModel(T2TConfig& config)
+void LN::InitModel(Config& config)
 {
    devID = config.devID;

@@ -57,6 +55,8 @@ void T2TLN::InitModel(T2TConfig& config)
    InitTensor1D(&b, d, X_FLOAT, devID);
    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();
+
+    w.SetDataFixed(1);
 }

 /*
@@ -64,7 +64,7 @@ make the network
 >> input - the input tensor
 >> return - layer normalization output
 */
-XTensor T2TLN::Make(XTensor& input)
+XTensor LN::Make(XTensor& input)
 {
    XTensor& x = input;
    XTensor xn;

--- a/source/sample/transformer/module/T2TLayerNormal.h
+++ b/source/sample/transformer/module/T2TLayerNormal.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,20 +19,20 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TLAYERNORMAL_H__
-#define __T2TLAYERNORMAL_H__
+#ifndef __LAYERNORMAL_H__
+#define __LAYERNORMAL_H__

-#include "T2TUtility.h"
-#include "../../../network/XNet.h"
+#include "../Utility.h"
+#include "../../../network//XNet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* layer normalization: y = norm(x) * w + b
   where norm(x) = (x - mean)/standardDeviation */
-class T2TLN
+class LN
 {
 public:
    /* device id */
@@ -50,13 +49,13 @@ public:

 public:
    /* constructor */
-    T2TLN();
+    LN();

    /* de-constructor */
-    ~T2TLN();
+    ~LN();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network */
    XTensor Make(XTensor& input);

--- a/source/sample/transformer/module/T2TNNUtil.cpp
+++ b/source/sample/transformer/module/T2TNNUtil.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,12 +15,12 @@
 */

 /*
- * $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
 */

-#include "T2TNNUtil.h"
+#include "NNUtil.h"

-namespace transformer
+namespace nmt
 {

 /* 

--- a/source/sample/transformer/module/T2TNNUtil.h
+++ b/source/sample/transformer/module/T2TNNUtil.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,11 +15,11 @@
 */

 /*
- * $Created by: Chi (huchinlp@foxmail.com) 2020-03-21
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2020-03-21
 */

-#ifndef __T2TNNUTIL_H__
-#define __T2TNNUTIL_H__
+#ifndef __NNUTIL_H__
+#define __NNUTIL_H__

 #include "../../../tensor/XGlobal.h"
 #include "../../../tensor/core/CHeader.h"
@@ -28,7 +27,7 @@

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* the gather function for tensor with any dimension */

--- a/source/sample/transformer/module/T2TOutput.cpp
+++ b/source/sample/transformer/module/T2TOutput.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,18 +19,16 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include <cmath>
-
-#include "T2TOutput.h"
-#include "T2TUtility.h"
-#include "T2TEmbedding.h"
+#include "Output.h"
+#include "Embedding.h"
+#include "../Utility.h"
 #include "../../../tensor/core/CHeader.h"

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TOutput::T2TOutput()
+Output::Output()
 {
    devID = -1;
    vSize = -1;
@@ -39,7 +36,7 @@ T2TOutput::T2TOutput()
 }

 /* de-constructor */
-T2TOutput::~T2TOutput()
+Output::~Output()
 {
 }

@@ -47,7 +44,7 @@ T2TOutput::~T2TOutput()
 initialize the model
 >> config - configurations of the model
 */
-void T2TOutput::InitModel(T2TConfig& config)
+void Output::InitModel(Config& config)
 {
    devID = config.devID;
    hSize = config.modelSize;
@@ -66,7 +63,7 @@ make the network (redefined output tensor)
 >> isTraining - whether it is used for training
 >> normalized - whether ignore the log-softmax
 */
-void T2TOutput::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
+void Output::Make(XTensor& input, XTensor& output, bool isTraining, bool normalized)
 {
    XTensor& x = input;


--- a/source/sample/transformer/module/T2TOutput.h
+++ b/source/sample/transformer/module/T2TOutput.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,19 +19,19 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TOUTPUT_H__
-#define __T2TOUTPUT_H__
+#ifndef __OUTPUT_H__
+#define __OUTPUT_H__

-#include "T2TUtility.h"
+#include "../Utility.h"
 #include "../../../tensor/function/FHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* output layer */
-class T2TOutput
+class Output
 {
 public:
    /* device id */
@@ -49,13 +48,13 @@ public:

 public:
    /* constructor */
-    T2TOutput();
+    Output();

    /* de-constructor */
-    ~T2TOutput();
+    ~Output();

    /* initialize the model */
-    void InitModel(T2TConfig& config);
+    void InitModel(Config& config);

    /* make the network (redefined output tensor) */
    void Make(XTensor& input, XTensor& output, bool isTraining, bool normalized);

--- a/source/sample/transformer/train/T2TBatchLoader.cpp
+++ b/source/sample/transformer/train/T2TBatchLoader.cpp
--- a/source/sample/transformer/train/T2TBatchLoader.h
+++ b/source/sample/transformer/train/T2TBatchLoader.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
- * it is cold today but I'll move to a warm place tomorrow :)
- */
-
-#ifndef __T2TBATCHLOADER_H__
-#define __T2TBATCHLOADER_H__
-
-#include "../module/T2TUtility.h"
-#include "../../../network/XNet.h"
-
-using namespace nts;
-
-namespace transformer
-{
-
-#define MAX_SEQUENCE_LENGTH 1024 * 4
-
-/* node to keep batch information */
-struct BatchNode
-{
-    /* beginning position */
-    int beg;
-
-    /* end position */
-    int end;
-
-    /* maximum word number on the encoder side */
-    int maxEnc;
-
-    /* maximum word number on the decoder side */
-    int maxDec;
-
-    /* a key for sorting */
-    int key;
-};
-
-class T2TBatchLoader
-{
-public:
-    /* buffer for loading words */
-    int* buf;
-
-    /* another buffer */
-    int* buf2;
-
-    /* batch buf */
-    BatchNode* bufBatch;
-
-    /* buffer size */
-    int bufSize;
-
-    /* size of batch buffer */
-    int bufBatchSize;
-
-    /* length of each sequence */
-    int* seqLen;
-
-    /* another array */
-    int* seqLen2;
-
-    /* offset of the first word for each sequence */
-    int* seqOffset;
-
-    /* number of sequences in the buffer */
-    int nseqBuf;
-
-    /* offset for next sequence in the buffer */
-    int nextSeq;
-
-    /* offset for next batch */
-    int nextBatch;
-
-    /* indicates whether we double the </s> symbol for the output of LM */
-    bool isDoubledEnd;
-
-    /* indicates whether we use batchsize = max * sc
-       rather rather than batchsize = word-number, where max is the maximum
-       length and sc is the sentence number */
-    bool isSmallBatch;
-
-    /* counterpart of "isSmallBatch" */
-    bool isBigBatch;
-
-    /* randomize batches */
-    bool isRandomBatch;
-
-    /* bucket size */
-    int bucketSize;
-
-public:
-    /* constructor */
-    T2TBatchLoader();
-
-    /* de-constructor */
-    ~T2TBatchLoader();
-
-    /* initialization */
-    void Init(T2TConfig& config);
-
-    /* load data to buffer */
-    int LoadBuf(FILE* file, bool isSorted, int step);
-
-    /* clear data buffer */
-    void ClearBuf();
-
-    /* set the random batch flag */
-    void SetRandomBatch(bool flag = true);
-
-    /* load a batch of sequences */
-    int LoadBatch(FILE* file, bool isLM,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs,
-        int vsEnc, int vsDec, int sBatch, int wBatch,
-        bool isSorted, int& ws, int& wCount,
-        int devID, bool isTraining);
-
-    /* load a batch of sequences (for language modeling) */
-    int LoadBatchLM(FILE* file,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs, int vs, int sBatch, int wBatch,
-        bool isSorted, int& wCount,
-        int devID, bool isTraining);
-
-    /* load a batch of sequences (for machine translation) */
-    int LoadBatchMT(FILE* file,
-        XTensor* batchEnc, XTensor* paddingEnc,
-        XTensor* batchDec, XTensor* paddingDec,
-        XTensor* gold, XTensor* label,
-        int* seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
-        bool isSorted, int& ws, int& wCount,
-        int devID, bool isTraining);
-
-    /* shuffle the data file */
-    void Shuffle(const char* srcFile, const char* tgtFile);
-};
-
-}
-
-#endif
\ No newline at end of file
--- a/source/sample/transformer/train/TrainDataSet.cpp
+++ b/source/sample/transformer/train/TrainDataSet.cpp
--- a/source/sample/transformer/train/TrainDataSet.h
+++ b/source/sample/transformer/train/TrainDataSet.h
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-03
+ * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
+ */
+
+#ifndef __TRAIN_DATASET_H__
+#define __TRAIN_DATASET_H__
+
+#include <cstdio>
+#include <vector>
+#include <fstream>
+
+#include "../../../tensor/XList.h"
+#include "../../../tensor/XTensor.h"
+#include "../../../tensor/XGlobal.h"
+
+#define MAX_WORD_NUM 120
+
+using namespace std;
+
+namespace nts {
+
+/* a sentence pair used as one training example
+   (both sides are stored as integer lists) */
+struct TrainExample {
+
+    /* id of the sentence pair */
+    int id;
+
+    /* source language sentence (tokenized) */
+    IntList srcSent;
+
+    /* target language sentence (tokenized) */
+    IntList tgtSent;
+
+    /* the key used to shuffle items in a bucket */
+    int key;
+
+    /* the key used to shuffle buckets */
+    int bucketKey;
+};
+
+/* A `TrainDataSet` is associated with a file which contains training data.
+   It keeps a buffer of data loaded from that file, groups the data into
+   buckets of similar length and generates mini-batches from the buffer.
+   NOTE(review): only declarations are visible here; the behavior described
+   below follows the member comments and should be checked against
+   TrainDataSet.cpp. */
+struct TrainDataSet {
+public:
+    /* the data buffer */
+    TrainBufferType buffer;
+
+    /* a list of the line numbers of empty lines */
+    IntList emptyLines;
+
+    /* the pointer to the file stream */
+    FILE* fp;
+
+    /* current index in the buffer */
+    size_t curIdx;
+
+    /* size of the used data in the buffer */
+    size_t bufferUsed;
+
+    /* size of the bucket used for grouping sentences */
+    size_t bucketSize;
+
+    /* indicates whether the dataset is used for training */
+    bool isTraining;
+
+public:
+
+    /* sort the input by length (in descending order) */
+    void SortByLength();
+
+    /* sort buckets by key (in descending order) */
+    void SortBucket();
+
+    /* sort the items between `begin` and `end` by key (in descending order);
+       NOTE(review): whether `end` is inclusive is not visible in this header */
+    void SortInBucket(int begin, int end);
+
+    /* load data from the file to the buffer */
+    void LoadDataToBuffer();
+
+    /* generate a mini-batch
+       >> batchEnc, paddingEnc - batch and padding tensors of the encoder side
+       >> batchDec, paddingDec - batch and padding tensors of the decoder side
+       >> label - tensor of the labels
+       >> minSentBatch, batchSize - batch size controls (exact meaning to be
+          confirmed in TrainDataSet.cpp)
+       >> devID - device id
+       << return - a list of unsigned 64-bit integers (contents to be
+          confirmed in TrainDataSet.cpp) */
+    UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
+                         XTensor* batchDec, XTensor* paddingDec, XTensor* label,
+                         size_t minSentBatch, size_t batchSize, int devID);
+
+    /* initialization function
+       >> dataFile - name of the data file
+       >> bucketSize - size of the bucket used for grouping sentences
+       >> training - presumably stored in `isTraining` (confirm) */
+    void Init(const char* dataFile, int bucketSize, bool training);
+
+    /* check if the buffer is empty */
+    bool IsEmpty();
+
+    /* reset the buffer */
+    void ClearBuf();
+
+    /* group data into buckets with similar length */
+    void BuildBucket();
+
+    /* de-constructor */
+    ~TrainDataSet();
+};
+}
+
+#endif // __TRAIN_DATASET_H__
\ No newline at end of file
--- a/source/sample/transformer/train/T2TTrainer.cpp
+++ b/source/sample/transformer/train/T2TTrainer.cpp
--- a/source/sample/transformer/train/T2TTrainer.h
+++ b/source/sample/transformer/train/T2TTrainer.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,25 +18,24 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-08-02
 */

-#ifndef __T2TTRAINER_H__
-#define __T2TTRAINER_H__
+#ifndef __TRAINER_H__
+#define __TRAINER_H__

-#include "../T2TModel.h"
-#include "T2TBatchLoader.h"
-#include "../../../tensor/function/FHeader.h"
+#include "../Model.h"
+#include "TrainDataSet.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

-/* trainer of the T2T model */
-class T2TTrainer
+/* trainer of the NMT model */
+class Trainer
 {
 public:

    /* configurations */
-    T2TConfig* cfg;
+    Config* cfg;

    /* dimension size of each inner layer */
    int d;
@@ -63,12 +61,18 @@ public:
    /* word batch size */
    int wBatchSize;

+    /* size of bucket for grouping data by length */
+    int bucketSize;
+
    /* training epoch number */
    int nepoch;

    /* training step number */
    int nstep;

+    /* the maximum number of saved checkpoints */
+    int maxCheckpoint;
+
    /* indicates whether we use adam */
    bool useAdam;

@@ -100,39 +104,36 @@ public:
    /* number of batches on which we do model update */
    int updateStep;

-    /* indicates whether we intend to debug the net */
-    bool isDebugged;
-
    /* indicates whether the sequence is sorted by length */
    bool isLenSorted;

-    /* for batching */
-    T2TBatchLoader batchLoader;
+    /* used for loading batches */
+    TrainDataSet batchLoader;

 public:
    /* constructor */
-    T2TTrainer();
+    Trainer();

    /* de-constructor */
-    ~T2TTrainer();
+    ~Trainer();

    /* initialize the trainer */
-    void Init(T2TConfig& config);
+    void Init(Config& config);

    /* train the model */
-    void Train(const char* fn, const char* validFN, const char* modelFN, T2TModel* model);
+    void Train(const char* fn, const char* validFN, const char* modelFN, Model* model);

    /* test the model */
-    void Validate(const char* fn, const char* ofn, T2TModel* model);
+    void Validate(const char* fn, const char* ofn, Model* model);

    /* make a checkpoint */
-    void MakeCheckpoint(T2TModel* model, const char* validFN, const char* modelFN, const char* label, int id);
+    void MakeCheckpoint(Model* model, const char* validFN, const char* modelFN, const char* label, int id);

    /* update the model by delta rule */
-    void Update(T2TModel* model, const float lr);
+    void Update(Model* model, const float lr);

    /* prepare model for training */
-    void PrepareModel(T2TModel* model);
+    void PrepareModel(Model* model);
 };

 }

--- a/source/sample/transformer/translate/T2TDataSet.cpp
+++ b/source/sample/transformer/translate/T2TDataSet.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,24 +25,26 @@
 #include <fstream>
 #include <algorithm>

-#include "T2TDataSet.h"
-#include "../module/T2TUtility.h"
+#include "DataSet.h"
+#include "../Utility.h"

-using namespace transformer;
+using namespace nmt;

 namespace nts {

 /* sort the input by length (in descending order) */
 void DataSet::SortInput() {
-    sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, [](Example* a, Example* b) {
-        return a->values.count > b->values.count;
+    sort(inputBuffer.items, inputBuffer.items + inputBuffer.count, 
+        [](Example* a, Example* b) {
+            return a->values.count > b->values.count;
        });
 }

 /* sort the output by id (in ascending order) */
 void DataSet::SortOutput() {
-    sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, [](Result* a, Result* b) {
-        return a->id < b->id;
+    sort(outputBuffer.items, outputBuffer.items + outputBuffer.count, 
+        [](Result* a, Result* b) {
+            return a->id < b->id;
        });
 }

@@ -74,7 +75,7 @@ void DataSet::LoadDataToBuffer()
                : line.size() - indices[i];
            string word = line.substr(indices[i], offset);
            if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
-                values.Add(3);
+                values.Add(UNK);
            else
                values.Add(srcVocab.word2id.at(word));
        }
@@ -100,7 +101,7 @@ void DataSet::LoadDataToBuffer()
 }

 /*
-load a mini-batch to the device
+load a mini-batch to the device (for translating)
 >> batchEnc - a tensor to store the batch of input
 >> paddingEnc - a tensor to store the batch of paddings
 >> minSentBatch - the minimum number of sentence batch
@@ -117,10 +118,10 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    size_t maxLen = inputBuffer[bufferUsed]->values.Size();

    /* dynamic batching for sentences */
-    while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
-        && (realBatchSize * maxLen < batchSize)) {
-        realBatchSize++;
-    }
+    //while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
+    //    && (realBatchSize * maxLen < batchSize)) {
+    //    realBatchSize++;
+    //}

    /* real batch size */
    if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
@@ -133,13 +134,13 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
    float* paddingValues = new float[realBatchSize * maxLen];

    for (int i = 0; i < realBatchSize * maxLen; i++) {
-        batchValues[i] = 1;
-        paddingValues[i] = 0.0F;
+        batchValues[i] = PAD;
+        paddingValues[i] = 1.0F;
    }

-    size_t cur = 0;
+    size_t curSrc = 0;

-    /* left padding */
+    /* right padding */
    UInt64List infos;
    size_t totalLength = 0;

@@ -147,11 +148,11 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
        infos.Add(inputBuffer[bufferUsed + i]->id);
        totalLength += inputBuffer[bufferUsed + i]->values.Size();

-        cur = maxLen * (i + 1) - inputBuffer[bufferUsed + i]->values.Size();
-        for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++) {
-            batchValues[cur] = inputBuffer[bufferUsed + i]->values[j];
-            paddingValues[cur++] = 1.0F;
-        }
+        curSrc = maxLen * i;
+        for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
+            batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
+        while (curSrc < maxLen * (i + 1))
+            paddingValues[curSrc++] = 0;
    }
    infos.Add(totalLength);

@@ -178,7 +179,7 @@ the constructor of DataSet
 void DataSet::Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN)
 {
    fp = new ifstream(dataFile);
-    CheckNTErrors(fp->is_open(), "can not open the file");
+    CheckNTErrors(fp->is_open(), "Can not open the test data");
    bufferUsed = 0;

    CheckNTErrors(strcmp(srcVocabFN, "") != 0, "missing source vocab file");

--- a/source/sample/transformer/translate/T2TDataSet.h
+++ b/source/sample/transformer/translate/T2TDataSet.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,7 +25,7 @@
 #include <cstdio>
 #include <vector>
 #include <fstream>
-#include "T2TVocab.h"
+#include "Vocab.h"

 #include "../../../tensor/XList.h"
 #include "../../../tensor/XTensor.h"

--- a/source/sample/transformer/translate/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/translate/T2TLengthPenalty.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,11 +21,11 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#include "T2TLengthPenalty.h"
+#include "LengthPenalty.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /*
@@ -36,7 +35,7 @@ where n = length of the sequence
 >> alpha - the parameter controls the length preference
 << return - length penalty of the sequence
 */
-float T2TLengthPenalizer::GNMT(float length, float alpha)
+float LengthPenalizer::GNMT(float length, float alpha)
 {
    float base;
    float lp;

--- a/source/sample/transformer/translate/T2TLengthPenalty.h
+++ b/source/sample/transformer/translate/T2TLengthPenalty.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,21 +21,21 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TLENGTHPENALTY_H__
-#define __T2TLENGTHPENALTY_H__
+#ifndef __LENGTHPENALTY_H__
+#define __LENGTHPENALTY_H__

-#include "../module/T2TUtility.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* We intend to penalize short sequences because they have higher score
   in product of a sequence of probability-like terms and have more chances
   to beat others in search. */
-class T2TLengthPenalizer
+class LengthPenalizer
 {
 public:
    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha

--- a/source/sample/transformer/translate/T2TPredictor.cpp
+++ b/source/sample/transformer/translate/T2TPredictor.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,23 +21,23 @@

 #include <iostream>

-#include "T2TPredictor.h"
-#include "../module/T2TNNUtil.h"
+#include "Predictor.h"
+#include "../module/NNUtil.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TStateBundle::T2TStateBundle()
+StateBundle::StateBundle()
 {
    states = NULL;
    isStart = false;
 }

 /* de-constructor */
-T2TStateBundle::~T2TStateBundle()
+StateBundle::~StateBundle()
 {
    if (states != NULL)
        delete[] states;
@@ -48,18 +47,18 @@ T2TStateBundle::~T2TStateBundle()
 create states
 >> num - number of states
 */
-void T2TStateBundle::MakeStates(int num)
+void StateBundle::MakeStates(int num)
 {
    CheckNTErrors(num > 0, "invalid number");

    if (states != NULL)
        delete[] states;

-    states = new T2TState[num];
+    states = new State[num];

    for (int i = 0; i < num; i++) {
        states[i].prediction = -1;
-        states[i].pid = T2T_PID_EMPTY;
+        states[i].pid = _PID_EMPTY;
        states[i].isEnd = false;
        states[i].isStart = false;
        states[i].isCompleted = false;
@@ -74,26 +73,26 @@ void T2TStateBundle::MakeStates(int num)
 }

 /* constructor */
-T2TPredictor::T2TPredictor()
+Predictor::Predictor()
 {
    startSymbol = 2;
 }

 /* de-constructor */
-T2TPredictor::~T2TPredictor()
+Predictor::~Predictor()
 {
 }

 /*
 create an initial state
->> model - the t2t model
+>> model - the NMT model
 >> top - the top-most layer of the network
 >> input - input of the network
 >> beamSize - beam size
 >> state - the state to be initialized
 */
-void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
-    int beamSize, T2TStateBundle* state)
+void Predictor::Create(Model* model, XTensor* top, const XTensor* input,
+                       int beamSize, StateBundle* state)
 {
    int dims[MAX_TENSOR_DIM_NUM];
    for (int i = 0; i < input->order - 1; i++)
@@ -114,20 +113,20 @@ void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input,
 set start symbol
 >> symbol - the symbol (in integer)
 */
-void T2TPredictor::SetStartSymbol(int symbol)
+void Predictor::SetStartSymbol(int symbol)
 {
    startSymbol = symbol;
 }

 /*
 read a state
->> model - the t2t model that keeps the network created so far
+>> model - the NMT model that keeps the network created so far
 >> state - a set of states. It keeps
 1) hypotheses (states)
 2) probabilities of hypotheses
 3) parts of the network for expanding toward the next state
 */
-void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
+void Predictor::Read(Model* model, StateBundle* state)
 {
    m = model;
    s = state;
@@ -147,9 +146,9 @@ predict the next state
 >> needReorder - whether we need reordering the states
 >> nstep - current time step of the target sequence
 */
-void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& encoding,
-                           XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
-                           XTensor& reorderState, bool needReorder, int nstep)
+void Predictor::Predict(StateBundle* next, XTensor& aliveState, XTensor& encoding,
+                        XTensor& inputEnc, XTensor& paddingEnc, int batchSize, bool isStart,
+                        XTensor& reorderState, bool needReorder, int nstep)
 {
    int dims[MAX_TENSOR_DIM_NUM];

@@ -221,14 +220,14 @@ void T2TPredictor::Predict(T2TStateBundle* next, XTensor& aliveState, XTensor& e
 generate paths up to the states of the current step
 >> state - state bundle of the current step
 */
-XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
+XTensor Predictor::GeneratePaths(StateBundle* state)
 {
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    int distance = -1;

    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
+        State* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
@@ -245,7 +244,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
    path.SetZeroAll();

    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
+        State* cur = state->states + i;
        int nsteps = 0;

        while (cur != NULL) {
@@ -263,21 +262,21 @@ get the predictions of the previous step
 >> state - state bundle of the current step
 >> devID - the device id for the predictions
 */
-XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state, int devID)
+XTensor Predictor::GetLastPrediction(StateBundle* state, int devID)
 {
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    IntList last;

    for (int i = 0; i < state->stateNum; i++) {
-        T2TState* cur = state->states + i;
+        State* cur = state->states + i;

        last.Add(cur->prediction);
    }

    XTensor lastPred;
-    InitTensor2D(&lastPred, last.Size(), 1, X_INT, devID);
-    lastPred.SetData(last.items, last.Size());
+    InitTensor2D(&lastPred, int(last.Size()), 1, X_INT, devID);
+    lastPred.SetData(last.items, int(last.Size()));

    return lastPred;
 }

--- a/source/sample/transformer/translate/T2TPredictor.h
+++ b/source/sample/transformer/translate/T2TPredictor.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,22 +20,22 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04
 */

-#ifndef __T2TPREDICTOR_H__
-#define __T2TPREDICTOR_H__
+#ifndef __PREDICTOR_H__
+#define __PREDICTOR_H__

-#include "../T2TModel.h"
-#include "T2TLengthPenalty.h"
+#include "../Model.h"
+#include "LengthPenalty.h"

 using namespace std;

-namespace transformer
+namespace nmt
 {

-#define T2T_PID_EMPTY -1
+#define _PID_EMPTY -1

 /* state for search. It keeps the path (back-pointer), prediction distribution,
   etc. It can be regarded as a hypothesis in translation. */
-class T2TState
+class State
 {
 public:
    /* we assume that the prediction is an integer */
@@ -69,11 +68,11 @@ public:
    int nstep;

    /* pointer to the previous state */
-    T2TState* last;
+    State* last;
 };

 /* a bundle of states */
-class T2TStateBundle
+class StateBundle
 {
 public:
    /* predictions */
@@ -98,7 +97,7 @@ public:
    float nstep;

    /* list of states */
-    T2TState* states;
+    State* states;

    /* number of states */
    int stateNum;
@@ -108,10 +107,10 @@ public:

 public:
    /* constructor */
-    T2TStateBundle();
+    StateBundle();

    /* de-constructor */
-    ~T2TStateBundle();
+    ~StateBundle();

    /* create states */
    void MakeStates(int num);
@@ -122,14 +121,14 @@ public:
   we get the state of previous words and then generate the next word.
   Here, a state can be regarded as the representation of words (word
   indices, hidden states, embeddings, etc.). */
-class T2TPredictor
+class Predictor
 {
 private:
    /* pointer to the transformer model */
-    T2TModel* m;
+    Model* m;

    /* current state */
-    T2TStateBundle* s;
+    StateBundle* s;

    /* start symbol */
    int startSymbol;
@@ -139,30 +138,30 @@ private:

 public:
    /* constructor */
-    T2TPredictor();
+    Predictor();

    /* de-constructor */
-    ~T2TPredictor();
+    ~Predictor();

    /* create an initial state */
-    void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
+    void Create(Model* model, XTensor* top, const XTensor* input, int beamSize, StateBundle* state);

    /* set the start symbol */
    void SetStartSymbol(int symbol);

    /* read a state */
-    void Read(T2TModel* model, T2TStateBundle* state);
+    void Read(Model* model, StateBundle* state);

    /* predict the next state */
-    void Predict(T2TStateBundle* next, XTensor& aliveIndices, XTensor& encoding,
+    void Predict(StateBundle* next, XTensor& aliveIndices, XTensor& encoding,
        XTensor& inputEnc, XTensor& paddingEnc, int rawBatchSize,
        bool isStart, XTensor& reorderState, bool needReorder, int nstep);

    /* generate paths up to the states of the current step */
-    XTensor GeneratePaths(T2TStateBundle* state);
+    XTensor GeneratePaths(StateBundle* state);

    /* get the predictions of the previous step */
-    XTensor GetLastPrediction(T2TStateBundle* state, int devID);
+    XTensor GetLastPrediction(StateBundle* state, int devID);
 };

 }

--- a/source/sample/transformer/translate/T2TSearch.cpp
+++ b/source/sample/transformer/translate/T2TSearch.cpp
--- a/source/sample/transformer/translate/T2TSearch.h
+++ b/source/sample/transformer/translate/T2TSearch.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,15 +19,15 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
 */

-#ifndef __T2TSEARCH_H__
-#define __T2TSEARCH_H__
+#ifndef __SEARCH_H__
+#define __SEARCH_H__

-#include "../T2TModel.h"
-#include "T2TPredictor.h"
+#include "../Model.h"
+#include "Predictor.h"

 using namespace std;

-namespace transformer
+namespace nmt
 {

 /* The class organizes the search process. It calls "predictors" to generate
@@ -42,7 +41,7 @@ private:
    float alpha;

    /* predictor */
-    T2TPredictor predictor;
+    Predictor predictor;

    /* max length of the generated sequence */
    int maxLength;
@@ -88,28 +87,28 @@ public:
    ~BeamSearch();

    /* initialize the model */
-    void Init(T2TConfig& config);
+    void Init(Config& config);

    /* search for the most promising states */
-    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);
+    void Search(Model* model, XTensor& input, XTensor& padding, IntList* output, XTensor& score);

    /* preparation */
    void Prepare(int myBatchSize, int myBeamSize);

    /* compute the model score for each hypothesis */
-    void Score(T2TStateBundle* prev, T2TStateBundle* beam);
+    void Score(StateBundle* prev, StateBundle* beam);

    /* generate token indices via beam pruning */
-    void Generate(T2TStateBundle* prev, T2TStateBundle* beam);
+    void Generate(StateBundle* prev, StateBundle* beam);

    /* expand the search graph */
-    void Expand(T2TStateBundle* prev, T2TStateBundle* beam, XTensor& reorderState);
+    void Expand(StateBundle* prev, StateBundle* beam, XTensor& reorderState);

    /* collect hypotheses with ending symbol */
-    void Collect(T2TStateBundle* beam);
+    void Collect(StateBundle* beam);

    /* fill the hypotheses heap with incomplete hypotheses */
-    void FillHeap(T2TStateBundle* beam);
+    void FillHeap(StateBundle* beam);

    /* save the output sequences and score */
    void Dump(IntList* output, XTensor* score);
@@ -118,17 +117,17 @@ public:
    bool IsEnd(int token);

    /* check whether all hypotheses are completed */
-    bool IsAllCompleted(T2TStateBundle* beam);
+    bool IsAllCompleted(StateBundle* beam);

    /* update the beam by pruning finished states */
-    void RemoveFinishedStates(T2TStateBundle* beam, XTensor& aliveEncoding,
+    void RemoveFinishedStates(StateBundle* beam, XTensor& aliveEncoding,
        XTensor& aliveInput, XTensor& alivePadding, XTensor& aliveIdx);

    /* set end symbols for search */
    void SetEnd(const int* tokens, const int tokenNum);

    /* make a mask to prevent duplicated entries in beam expansion for the first position */
-    XTensor MakeFirstMask(T2TStateBundle* beam);
+    XTensor MakeFirstMask(StateBundle* beam);
 };

 class GreedySearch
@@ -136,7 +135,7 @@ class GreedySearch
 private:

    /* predictor */
-    T2TPredictor predictor;
+    Predictor predictor;

    /* max length of the generated sequence */
    int maxLength;
@@ -164,10 +163,10 @@ public:
    ~GreedySearch();

    /* initialize the model */
-    void Init(T2TConfig& config);
+    void Init(Config& config);

    /* search for the most promising states */
-    void Search(T2TModel* model, XTensor& input, XTensor& padding, IntList* output);
+    void Search(Model* model, XTensor& input, XTensor& padding, IntList* output);

    /* preparation */
    void Prepare(int myBatchSize);

--- a/source/sample/transformer/translate/T2TTranslator.cpp
+++ b/source/sample/transformer/translate/T2TTranslator.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,27 +19,25 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-04, 2020-06
 */

-#include <cmath>
-
-#include "T2TTranslator.h"
-#include "T2TSearch.h"
-#include "../module/T2TUtility.h"
+#include "Search.h"
+#include "Translator.h"
+#include "../Utility.h"
 #include "../../../tensor/XTensor.h"
 #include "../../../tensor/XUtility.h"
 #include "../../../tensor/core/CHeader.h"

 using namespace nts;

-namespace transformer
+namespace nmt
 {

 /* constructor */
-T2TTranslator::T2TTranslator()
+Translator::Translator()
 {
 }

 /* de-constructor */
-T2TTranslator::~T2TTranslator()
+Translator::~Translator()
 {
    if (beamSize > 1)
        delete (BeamSearch*)seacher;
@@ -49,7 +46,7 @@ T2TTranslator::~T2TTranslator()
 }

 /* initialize the model */
-void T2TTranslator::Init(T2TConfig& config)
+void Translator::Init(Config& config)
 {
    beamSize = config.beamSize;
    vSize = config.srcVocabSize;
@@ -58,17 +55,17 @@ void T2TTranslator::Init(T2TConfig& config)
    wordBatch = config.wBatchSize;

    if (beamSize > 1) {
-        XPRINT1(0, stderr, "Translating with beam search (%d)\n", beamSize);
+        LOG("translating with beam search (%d)", beamSize);
        seacher = new BeamSearch();
        ((BeamSearch*)seacher)->Init(config);
    }
    else if (beamSize == 1) {
-        XPRINT1(0, stderr, "Translating with greedy search (%d)\n", beamSize);
+        LOG("translating with greedy search");
        seacher = new GreedySearch();
        ((GreedySearch*)seacher)->Init(config);
    }
    else {
-        CheckNTErrors(false, "invalid beam size\n");
+        CheckNTErrors(false, "Invalid beam size\n");
    }
 }

@@ -80,8 +77,8 @@ test the model
 >> ofn - output data file
 >> model - pretrained model
 */
-void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn, 
-                              const char* ofn, T2TModel* model)
+void Translator::Translate(const char* ifn, const char* sfn, 
+                           const char* tfn, const char* ofn, Model* model)
 {
    int wc = 0;
    int wordCountTotal = 0;
@@ -99,8 +96,7 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,
    XTensor paddingEnc;

    batchLoader.Init(ifn, sfn, tfn);
-    XPRINT1(0, stderr, "[INFO] loaded the input file, elapsed=%.1fs \n", 
-            GetClockSec() - startT);
+    LOG("loaded the input file, elapsed=%.1fs ", GetClockSec() - startT);

    int count = 0;
    double batchStart = GetClockSec();
@@ -130,24 +126,24 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,

        for (int i = 0; i < indices.Size() - 1; ++i) {
            Result* res = new Result;
-            res->id = indices[i];
+            res->id = int(indices[i]);
            res->res = output[i];
            batchLoader.outputBuffer.Add(res);
        }
        delete[] output;

-        wc += indices[-1];
-        wordCountTotal += indices[-1];
+        wc += int(indices[-1]);
+        wordCountTotal += int(indices[-1]);

-        sentCount += (indices.Size() - 1);
+        sentCount += int(indices.Size() - 1);
        batchCount += 1;

        if (count % 1 == 0) {
            double elapsed = GetClockSec() - batchStart;
            batchStart = GetClockSec();
-            XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sentence=%f, sword=%.1fw/s\n",
-                    elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()), 
-                    double(wc) / elapsed);
+            LOG("elapsed=%.1fs, sentence=%f, sword=%.1fw/s",
+                elapsed, float(sentCount) / float(batchLoader.inputBuffer.Size()), 
+                double(wc) / elapsed);
            wc = 0;
        }
    }
@@ -169,8 +165,8 @@ void T2TTranslator::Translate(const char* ifn, const char* sfn, const char* tfn,

    double elapsed = GetClockSec() - startDump;

-    XPRINT2(0, stderr, "[INFO] translation completed (word=%d, sent=%ld)\n",
-            wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
+    LOG("translation completed (word=%d, sent=%zu)", 
+        wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
 }

 /*
@@ -178,7 +174,7 @@ dump the result into the file
 >> file - data file
 >> output - output tensor
 */
-void T2TTranslator::Dump(FILE* file, XTensor* output)
+void Translator::Dump(FILE* file, XTensor* output)
 {
    if (output != NULL && output->unitNum != 0) {
        int seqLength = output->dimSize[output->order - 1];

--- a/source/sample/transformer/translate/T2TTranslator.h
+++ b/source/sample/transformer/translate/T2TTranslator.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2020, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,17 +20,17 @@
 * $Modified by: HU Chi (huchinlp@gmail.com) 2020-06
 */

-#ifndef __T2TTESTER_H__
-#define __T2TTESTER_H__
+#ifndef __TESTER_H__
+#define __TESTER_H__

-#include "T2TSearch.h"
-#include "T2TDataSet.h"
+#include "Search.h"
+#include "DataSet.h"

-namespace transformer
+namespace nmt
 {

 /* This class translates test sentences with a trained model. */
-class T2TTranslator
+class Translator
 {
 public:
    /* vocabulary size of the source side */
@@ -57,17 +56,17 @@ public:

 public:
    /* constructor */
-    T2TTranslator();
+    Translator();

    /* de-constructor */
-    ~T2TTranslator();
+    ~Translator();

    /* initialize the model */
-    void Init(T2TConfig& config);
+    void Init(Config& config);

    /* test the model */
    void Translate(const char* ifn, const char* vfn, const char* ofn, 
-                   const char* tfn, T2TModel* model);
+                   const char* tfn, Model* model);

    /* dump the result into the file */
    void Dump(FILE* file, XTensor* output);

--- a/source/sample/transformer/translate/T2TVocab.cpp
+++ b/source/sample/transformer/translate/T2TVocab.cpp
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,8 +20,8 @@

 #include <fstream>

-#include "T2TVocab.h"
-#include "../module/T2TUtility.h"
+#include "Vocab.h"
+#include "../Utility.h"

 namespace nts {

@@ -31,7 +30,7 @@ void Vocab::Load(const string& src)
 {
    string vsz, sid;
    ifstream f(src, ios::in);
-    CheckNTErrors(f.is_open(), "Unable to open the vocabulary file");
+    CheckNTErrors(f.is_open(), "unable to open the vocabulary file");

    /* get the vocab size and the start id */
    f >> vsz >> sid;

--- a/source/sample/transformer/translate/T2TVocab.h
+++ b/source/sample/transformer/translate/T2TVocab.h
-/* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
- * All rights reserved.
+/* NiuTrans.NMT - an open-source neural machine translation system.
+ * Copyright (C) 2020 NiuTrans Research. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,8 +18,8 @@
 * $Created by: HU Chi (huchinlp@foxmail.com) 2020-01-03
 */

-#ifndef __T2TVOCAB_H__
-#define __T2TVOCAB_H__
+#ifndef __VOCAB_H__
+#define __VOCAB_H__

 #include <cstdio>
 #include <unordered_map>
@@ -30,10 +29,10 @@ using namespace std;
 namespace nts {

 /* user-defined symbols */
-#define UNK 0
 #define PAD 1
 #define SOS 2
 #define EOS 2
+#define UNK 3

 /* the vocabulary class */
 struct Vocab