Commit 0b43acf6 by 姜雨帆

update

parent 896e5231
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
     GradMultiply(node, isEfficient);
 else if(operID == MATH_MULTIPLYDIM)
     GradMultiplyDim(node, isEfficient);
+else if (operID == MATH_MULTIPLYBROADCAST)
+    GradMultiplyBroadcast(node, isEfficient);
 else if(operID == MATH_NEGATE)
     GradNegate(node, isEfficient);
 else if(operID == MATH_NORMALIZE)
...
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
     InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
     InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
+    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
     float scale = 1.0F;
     float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
     float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
     float finfouta = (float)sqrt(6.0F * scale / (d + d));
+    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
     wk.SetDataRand(-finfoutk, finfoutk);
     wq.SetDataRand(-finfoutk, finfoutk);
     wv.SetDataRand(-finfoutv, finfoutv);
     wa.SetDataRand(-finfouta, finfouta);
+    wbig.SetDataRand(-finfoutbig, finfoutbig);
 }
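
The finfout* bounds follow Xavier (Glorot) uniform initialization: a weight of shape (fanIn, fanOut) is drawn from U(-b, b) with b = sqrt(6 / (fanIn + fanOut)), so the new wbig of shape (d, 3*d) gets b = sqrt(6 / (4*d)). A minimal sketch of the rule; the helper name is ours, for illustration only:

    #include <cmath>

    /* Xavier/Glorot uniform bound, matching the finfout* values above
       with scale = 1.0F; illustrative helper, not part of the library */
    float XavierBound(int fanIn, int fanOut, float scale = 1.0F)
    {
        return std::sqrt(6.0F * scale / (fanIn + fanOut));
    }

    /* e.g. for wbig of shape (d, 3*d):
       float b = XavierBound(d, 3 * d);
       wbig.SetDataRand(-b, b);                                       */
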
 /*
@@ -98,16 +101,40 @@ make the network
 >> isTraining - indicates whether the model is used for training
 << return - multi-attention result
 */
-XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
+XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
 {
     XTensor k2;
     XTensor q2;
     XTensor v2;
+    if (selfatt){
+        XTensor con;
+        XList split;
+        con = MMul(k, wbig);
+        int d1 = con.GetDim(0);
+        int d2 = con.GetDim(1);
+        int d3 = con.GetDim(2) / 3;
+        InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+        split.Add(&q2);
+        split.Add(&k2);
+        split.Add(&v2);
+        Split(con, split, 2, 3);
+    }
+    else{
         /* linear transformation before self-attention */
         k2 = MMul(k, wk);
         q2 = MMul(q, wq);
         v2 = MMul(v, wv);
+    }
     XTensor kheads;
     XTensor qheads;
...
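
The self-attention branch fuses the three projections: a single matmul with wbig computes Q, K and V together, and Split then slices dimension 2 into three equal parts. One large GEMM generally keeps the GPU busier than three small ones, which is presumably the motivation; the fused path is only valid when k, q and v are the same tensor, so the encoder-decoder attention keeps the separate wk/wq/wv path (selfatt == false). A condensed restatement of the new path, assuming x = k = q = v of shape (batch, length, d) and q2/k2/v2 initialized as above:

    XTensor con = MMul(x, wbig);   /* (batch, length, 3*d) */
    XList split;
    split.Add(&q2);                /* note the order: q, k, v */
    split.Add(&k2);
    split.Add(&v2);
    Split(con, split, 2, 3);       /* slice dim 2 into 3 parts of size d */
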
@@ -60,6 +60,7 @@ public:
     /* transformation after dot-product attention */
     XTensor wa;
+    /* joint transformation of Q, K and V for self-attention */
+    XTensor wbig;
     /* size of transformed Q and K */
     int dk;
@@ -95,7 +96,7 @@ public:
                    int myDevID = -1, XMem * myMem = NULL);
     /* make the network */
-    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
+    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
 };
 }
...
@@ -21,6 +21,8 @@
 #include <math.h>
 #include "T2TDecoder.h"
+#include "T2TUtility.h"
+#include "T2TLayerNormal.h"
 #include "../../tensor/core/CHeader.h"
 namespace transformer
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
                            bool myIsMasked, int myIgnored,
                            int myDevID, XMem * myMem)
 {
-    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    devID = myDevID;
+    mem = myMem;
+    ignored = myIgnored;
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+    CheckNTErrors(nlayer >= 1, "We have one decoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
+    /* embedding model */
+    embedder.InitModel(argc, argv, devID, mem, false);
+    attentions = new T2TAttention[nlayer];
+    fnns = new T2TFNN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
     attentionsEnde = new T2TAttention[nlayer];
     attEndeLayerNorms = new T2TLN[nlayer];
     /* initialize the stacked layers */
-    for(int i = 0; i < nlayer; i++){
-        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    for (int i = 0; i < nlayer; i++) {
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
         attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
     }
 }
 /*
@@ -82,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /* dropout */
     if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP, 2);
+        x = Dropout(x, dropoutP);
     for(int i = 0; i < nlayer; i++){
         XTensor att;
@@ -93,11 +122,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /******************/
     /* self attention */
-    att = attentions[i].Make(x, x, x, mask, isTraining);
+    att = attentions[i].Make(x, x, x, mask, isTraining, true);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        att = Dropout(att, dropoutP, 2);
+        att = Dropout(att, dropoutP);
     /* residual connection */
     res = Sum(att, x);
@@ -107,11 +136,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /*****************************/
     /* encoder-decoder attention */
-    ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
+    ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        ende = Dropout(ende, dropoutP, 2);
+        ende = Dropout(ende, dropoutP);
     /* residual connection */
     res = Sum(ende, x);
@@ -125,7 +154,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
     /* dropout */
     if(isTraining && dropoutP > 0)
-        fnn = Dropout(fnn, dropoutP, 2);
+        fnn = Dropout(fnn, dropoutP);
     /* residual connection */
     res = Sum(fnn, x);
...
@@ -27,9 +27,56 @@
 namespace transformer
 {
-class AttDecoder : public AttEncoder
+class AttDecoder
 {
 public:
+    /* device id */
+    int devID;
+    /* memory pool */
+    XMem * mem;
+    /* layer number */
+    int nlayer;
+    /* hidden layer size of the FNN layer */
+    int hSize;
+    /* embedding size */
+    int eSize;
+    /* vocabulary size */
+    int vSize;
+    /* dropout probability */
+    DTYPE dropoutP;
+    /* some positions can be ignored in attention. this is useful in LM where the first position needs
+     * special design for the attention model. */
+    int ignored;
+    /* embedding of word at each position */
+    T2TEmbedder embedder;
+    /* FNN model of each layer */
+    T2TFNN * fnns;
+    /* attention model of each layer */
+    T2TAttention * attentions;
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
+    /* input tensor of the decoder */
+    XTensor * input;
+    /* output tensor of the decoder */
+    XTensor * output;
     /* encoder-decoder attention model of each layer */
     T2TAttention * attentionsEnde;
...
@@ -48,12 +48,18 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
 {
     devID = myDevID;
     mem = myMem;
+    if(isEnc){
         LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    }
+    else{
+        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    }
+    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
     LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
     LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
...
@@ -71,7 +71,7 @@ public:
     ~T2TEmbedder();
     /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
     /* make positional embeddings */
     void MakePosEmbedding(int eSize, int d, int length);
...
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     /* dropout */
     if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP, 2);
+        x = Dropout(x, dropoutP);
     for(int i = 0; i < nlayer; i++){
         XTensor att;
@@ -116,11 +116,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     XTensor res;
     /* self attention */
-    att = attentions[i].Make(x, x, x, mask, isTraining);
+    att = attentions[i].Make(x, x, x, mask, isTraining, true);
     /* dropout */
     if(isTraining && dropoutP > 0)
-        att = Dropout(att, dropoutP, 2);
+        att = Dropout(att, dropoutP);
     /* residual connection */
     res = Sum(att, x);
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
     /* dropout */
     if(isTraining && dropoutP > 0)
-        fnn = Dropout(fnn, dropoutP, 2);
+        fnn = Dropout(fnn, dropoutP);
     /* residual connection */
     res = Sum(fnn, x);
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
 }
 }
@@ -274,9 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
     _Sum(&maskEnc, padding3, &maskEnc);
     encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+    //encoding.Dump(stderr, "encoding",10);
     decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
+    //decoding.Dump(stderr, "decoding", 10);
     outputLayer->Make(decoding, output);
     delete[] dims;
@@ -300,9 +301,10 @@ void T2TModel::GetParams(XList &list)
     list.Add(&encoder->fnns[i].b1);
     list.Add(&encoder->fnns[i].w2);
     list.Add(&encoder->fnns[i].b2);
-    list.Add(&encoder->attentions[i].wk);
-    list.Add(&encoder->attentions[i].wq);
-    list.Add(&encoder->attentions[i].wv);
+    //list.Add(&encoder->attentions[i].wk);
+    //list.Add(&encoder->attentions[i].wq);
+    //list.Add(&encoder->attentions[i].wv);
+    list.Add(&encoder->attentions[i].wbig);
     list.Add(&encoder->attentions[i].wa);
     list.Add(&encoder->fnnLayerNorms[i].w);
     list.Add(&encoder->fnnLayerNorms[i].b);
@@ -324,9 +326,10 @@ void T2TModel::GetParams(XList &list)
     list.Add(&decoder->attentionsEnde[i].wa);
     list.Add(&decoder->attEndeLayerNorms[i].w);
     list.Add(&decoder->attEndeLayerNorms[i].b);
-    list.Add(&decoder->attentions[i].wk);
-    list.Add(&decoder->attentions[i].wq);
-    list.Add(&decoder->attentions[i].wv);
+    //list.Add(&decoder->attentions[i].wk);
+    //list.Add(&decoder->attentions[i].wq);
+    //list.Add(&decoder->attentions[i].wv);
+    list.Add(&decoder->attentions[i].wbig);
     list.Add(&decoder->attentions[i].wa);
     list.Add(&decoder->fnnLayerNorms[i].w);
     list.Add(&decoder->fnnLayerNorms[i].b);
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
     float minmax = 0;
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
     LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
...
@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
     seqLen2 = NULL;
     nseqBuf = 0;
     nextSeq = -1;
+    nextBatch = -1;
     argNum = 0;
     argArray = NULL;
     buf = NULL;
     buf2 = NULL;
+    bufBatch = NULL;
     bufSize = 0;
+    bufBatchSize = 0;
     seqOffset = NULL;
 }
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
 {
     delete[] buf;
     delete[] buf2;
+    delete[] bufBatch;
     delete[] seqLen;
     delete[] seqLen2;
     delete[] seqOffset;
@@ -117,9 +121,11 @@ void T2TTrainer::Init(int argc, char ** argv)
     LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
     LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
     LoadParamBool(argc, argv, "debug", &isDebugged, false);
+    LoadParamBool(argc, argv, "randbatch", &isRandomBatch, false);
     buf = new int[bufSize];
     buf2 = new int[bufSize];
+    bufBatch = new BatchNode[bufSize];
     seqLen = new int[bufSize];
     seqLen2 = new int[bufSize];
     seqOffset = new int[bufSize];
@@ -172,6 +178,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     double startT = GetClockSec();
+    FILE * fileen = fopen("enc.txt", "w");
+    FILE * filede = fopen("dec.txt", "w");
     for(epoch = 1; epoch <= nepoch; epoch++){
 #ifndef WIN32
         if(isShuffled)
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
+    //batchEnc.Dump(stderr, "enc",1);
+    //batchDec.Dump(stderr, "dec",1);
+    //paddingDec.Dump(stderr, "paddec");
     /* output probabilities */
     XTensor output;
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
     LabelSmooth(&gold, &goldSmoothed, labelSmoothingP);
     /* make paddings for the output */
-    if (output.GetDim(0) > 1)
+    if (output.GetDim(0) > 0)
         PadOutput(&output, &gold, &paddingDec);
     /* get probabilities */
     float prob = GetProb(&output, &gold, NULL);
+    //printf("%f\n", prob);
+    //float prob = 0;
     DTYPE lossLocal = -prob / wc;
     bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
     XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
+    //doUpdate = false;
     if (doUpdate) {
         /* rescale the output for normalized loss */
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
         MakeCheckpoint(model, validFN, modelFN, "epoch", epoch);
     }
+    fclose(fileen);
+    fclose(filede);
     double elapsed = GetClockSec() - startT;
     epoch = MIN(epoch, nepoch);
@@ -434,11 +451,11 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
     sprintf(fn2, "%s.%s.%03d.output", modelFN, label, id);
     model->Dump(fn);
-    if(validFN != NULL){
-        T2TTrainer trainer;
-        trainer.Init(argNum, argArray);
-        trainer.Test(validFN, fn2, model);
-    }
+    //if(validFN != NULL){
+    //    T2TTrainer trainer;
+    //    trainer.Init(argNum, argArray);
+    //    trainer.Test(validFN, fn2, model);
+    //}
     delete[] fn;
     delete[] fn2;
@@ -473,7 +490,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     int wordCount = 0;
     while(fgets(line, MAX_SEQUENCE_LENGTH - 1, file)){
         int len = (int)strlen(line);
+        if(line[0]=='b')
+            break;
         while(line[len - 1] == '\r' || line[len - 1] == '\n'){
             line[len - 1] = 0;
             len--;
@@ -544,9 +562,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
     node.offset = i;
     node.p = buf + offset;
     node.size = 0;
-    for(int j = 0; j < step; j++)
+    int max = 0;
+    for(int j = 0; j < step; j++){
         node.size += seqLen[i + j];
-    node.value = seqLen[i];
+        max = MAX(max, seqLen[i + j]);
+    }
+    //node.value = seqLen[i+1]+seqLen[i];
+    //node.value = MAX(seqLen[i+1],seqLen[i]);
+    node.value = max;
     count++;
     offset += node.size;
 }
@@ -768,6 +791,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
     return sc;
 }
+int CompareBatchNode(const void * a, const void * b)
+{
+    return ((BatchNode*)b)->key - ((BatchNode*)a)->key;
+}
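
CompareBatchNode sorts in descending order of key (b minus a). Since every batch later receives key = rand(), sorting by key amounts to shuffling whole batches while each batch stays a contiguous run of the (typically length-sorted) buffer, so padding efficiency is preserved and only the batch order is randomized. A small standalone illustration of the mechanism, not the commit's own code:

    #include <cstdlib>

    /* shuffle batches by sorting on fresh random keys; BatchNode and
       CompareBatchNode are the ones introduced in this commit */
    void ShuffleBatches(BatchNode * bufBatch, int bufBatchSize)
    {
        for (int i = 0; i < bufBatchSize; i++)
            bufBatch[i].key = rand();
        qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
    }
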
 /*
 load a batch of sequences (for MT)
 >> file - the handle to the data file
@@ -797,10 +826,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
                             int devID, XMem * mem,
                             bool isTraining)
 {
-    if(nextSeq < 0 || nextSeq >= nseqBuf)
+    //if (nextSeq < 0 || nextSeq >= nseqBuf)
+    //    LoadBuf(file, isSorted, 2);
+    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
         LoadBuf(file, isSorted, 2);
-    int seq = MAX(nextSeq, 0);
+        int seq = 0;
+        bufBatchSize = 0;
+        nextBatch = 0;
+        /* we segment the buffer into batches */
+        while (seq < nseqBuf) {
+            int wcEnc = 0;
+            int wcDec = 0;
+            int wnEnc = 0;
+            int wnDec = 0;
+            int maxEnc = 0;
+            int maxDec = 0;
+            int sc = 0;
+            while (seq + sc < nseqBuf) {
+                /* source-side sequence */
+                wnEnc = seqLen[seq + sc];
+                /* target-side sequence */
+                wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
+                int tcEnc = isBigBatch ? (wcEnc + wnEnc) : MAX(maxEnc, wnEnc) * (sc + 2) / 2;
+                int tcDec = isBigBatch ? (wcDec + wnDec) : MAX(maxDec, wnDec) * (sc + 2) / 2;
+                if(sc != 0 && sc > sBatch * 2 && (tcEnc > wBatch || tcDec > wBatch))
+                    break;
+                wcEnc += wnEnc;
+                sc += 1;
+                if(maxEnc < wnEnc)
+                    maxEnc = wnEnc;
+                wcDec += wnDec;
+                sc += 1;
+                if(maxDec < wnDec)
+                    maxDec = wnDec;
+            }
+            BatchNode & batch = bufBatch[bufBatchSize];
+            batch.beg = seq;
+            batch.end = seq + sc;
+            batch.maxEnc = maxEnc;
+            batch.maxDec = maxDec;
+            batch.key = rand();
+            bufBatchSize++;
+            seq = seq + sc;
+        }
+        if(isRandomBatch)
+            qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode);
+    }
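
The segmentation loop walks the buffer in source/target pairs (sc grows by 2 per pair) and closes a batch once it holds more than sBatch * 2 sequences and the padded-token estimate exceeds the word budget wBatch. The estimate MAX(max, wn) * (sc + 2) / 2 is the longest sequence seen so far times the number of pairs after adding the current one, i.e. roughly the size of the padded batch tensor. A small worked restatement of that arithmetic, with hypothetical numbers:

    /* padded-token estimate used in the loop above (illustration only) */
    int PaddedTokenEstimate(int maxLenSoFar, int curLen, int sc)
    {
        int maxLen = maxLenSoFar > curLen ? maxLenSoFar : curLen;
        return maxLen * (sc + 2) / 2;   /* ~ longest sequence x pair count */
    }
    /* e.g. with wBatch = 4096 and source lengths near 32, the batch
       closes after roughly 4096 / 32 = 128 sentence pairs */
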
+    /*int seq = MAX(nextSeq, 0);
     int wcEnc = 0;
     int wcDec = 0;
     int wnEnc = 0;
@@ -813,10 +902,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     while(seq + sc < nseqBuf){
-        /* source-side sequence */
         wnEnc = seqLen[seq + sc];
-        /* target-side sequence */
         wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
         int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
@@ -841,8 +928,18 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     nextSeq = seq + sc;
     if(sc <= 0)
+        return 0;*/
+    if(bufBatchSize <= 0)
         return 0;
+    BatchNode & batch = bufBatch[nextBatch++];
+    int seq = batch.beg;
+    int sc = batch.end - batch.beg;
+    int maxEnc = batch.maxEnc;
+    int maxDec = batch.maxDec;
+    CheckNTErrors(sc % 2 == 0, "The input samples must be paired");
     int sCount = sc/2;
     int seqSize = 0;
     int dimsDec[3] = {sCount, maxDec, vsDec};
@@ -861,13 +958,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     int wCountEnc = 0;
     int wCountDec = 0;
+    int wCountPad = 0;
     int wGold = 0;
     wCount = 0;
     int * batchEncValues = new int[batchEnc->unitNum];
     int * batchDecValues = new int[batchDec->unitNum];
     //MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
-    //MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
+    MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
     MTYPE * goldOffsets = new MTYPE[sc * maxDec / 2];
     memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
@@ -901,7 +999,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     int num = buf[seqOffset[s] + w];
     batchDecValues[batchDec->GetOffset2D(sent, w)] = num;
     //paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
+    if (w < len-1){
+        paddingDecOffsets[wCountPad++] = paddingDec->GetOffset2D(sent, w);
+        wCount++;
+    }
     if (w > 0)
         goldOffsets[wGold++] = gold->GetOffset3D(sent, w - 1, buf[seqOffset[s] + w]);
@@ -911,7 +1012,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
     else
         goldOffsets[wGold++] = gold->GetOffset3D(sent, w, buf[seqOffset[s] + w + 1]);
     }
-    wCount++;
+    //wCount++;
     wCountDec++;
     if(seqs != NULL)
         seqs[seqSize++] = buf[seqOffset[s] + w];
@@ -924,19 +1025,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
 }
     batchDec->SetData(batchDecValues, batchDec->unitNum);
-    //paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec);
-    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
-    _ConvertDataType(batchDec, tmp2);
-    _NotEqual(tmp2, paddingDec, 0);
-    DelTensorBuf(tmp2);
+    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
+    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    //_ConvertDataType(batchDec, tmp2);
+    //_NotEqual(tmp2, paddingDec, 0);
+    //DelTensorBuf(tmp2);
     gold->SetDataBatched(goldOffsets, 1.0F, wGold);
     delete[] batchEncValues;
     delete[] batchDecValues;
     //delete[] paddingEncOffsets;
-    //delete[] paddingDecOffsets;
+    delete[] paddingDecOffsets;
     delete[] goldOffsets;
     return sc;
...
@@ -33,6 +33,25 @@ using namespace nts;
 namespace transformer
 {
+/* node to keep batch information */
+struct BatchNode
+{
+    /* beginning position */
+    int beg;
+    /* end position */
+    int end;
+    /* maximum word number on the encoder side */
+    int maxEnc;
+    /* maximum word number on the decoder side */
+    int maxDec;
+    /* a key for sorting */
+    int key;
+};
 /* trainer of the T2T model */
 class T2TTrainer
 {
@@ -49,9 +68,15 @@ public:
     /* another buffer */
     int * buf2;
+    /* batch buffer */
+    BatchNode * bufBatch;
     /* buffer size */
     int bufSize;
+    /* size of the batch buffer */
+    int bufBatchSize;
     /* length of each sequence */
     int * seqLen;
@@ -67,6 +92,9 @@ public:
     /* offset for next sequence in the buffer */
     int nextSeq;
+    /* offset of the next batch in the batch buffer */
+    int nextBatch;
     /* indicates whether the sequence is sorted by length */
     bool isLenSorted;
@@ -142,6 +170,9 @@ public:
     /* counterpart of "isSmallBatch" */
     bool isBigBatch;
+    /* randomize the order of batches */
+    bool isRandomBatch;
     /* indicates whether we intend to debug the net */
     bool isDebugged;
...
@@ -59,23 +59,28 @@ int TransformerMain(int argc, const char ** argv)
     LoadParamString(argc, args, "test", testFN, "");
     LoadParamString(argc, args, "output", outputFN, "");
+    srand((unsigned int)time(NULL));
     T2TTrainer trainer;
     trainer.Init(argc, args);
     T2TModel model;
     model.InitModel(argc, args);
+    //if(strcmp(modelFN, ""))
+    //    model.Read(modelFN);
     /* learn model parameters */
     if(strcmp(trainFN, ""))
         trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
     /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //    model.Dump(modelFN);
     /* load the model if necessary */
-    if(strcmp(modelFN, ""))
-        model.Read(modelFN);
+    //if(strcmp(modelFN, ""))
+    //    model.Read(modelFN);
     T2TTrainer tester;
     tester.Init(argc, args);
...
@@ -60,6 +60,7 @@ XDevice::~XDevice()
     cublasDestroy(cublasHandle);
     if(stream != NULL)
         delete stream;
+    curandDestroyGenerator(gen);
 #endif
 }
@@ -82,6 +83,10 @@ void XDevice::Init(int myDevID)
     cudaDeviceProp prop;
     cudaSetDevice(myDevID);
+    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
+    curandSetPseudoRandomGeneratorSeed(gen, seed);
     if(cudaGetDeviceProperties(&prop, devID) != cudaSuccess){
         XPRINT1(0, stderr, "cannot get GPU(%d) information.", devID);
         exit(1);
...
@@ -112,6 +112,9 @@ public:
     /* specify if the handle is initialized */
     bool isHandleReady;
+    /* generator of random numbers */
+    curandGenerator_t gen;
 #endif
...
@@ -1614,11 +1614,17 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
     else if (dataType == X_INT) {
         int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
         for(int i = beg; i < end; i++){
+            if((i%(dimSize[1]) == 0)&&(i!=0)) {
+                fprintf(file, " \n");
+            }
             int f = ((int*)d)[i];
             if(i == beg)
                 fprintf(file, "%d", f);
             else
                 fprintf(file, " %d", f);
+            //if((i%(dimSize[1]-1) == 0)&&(i!=0)) {
+            //    fprintf(file, " \n");
+            //}
         }
     }
     else
...
@@ -387,7 +387,7 @@ generate data items with a uniform distribution in [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
-void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
+void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
 {
     CheckNTErrors(upper > lower, "the high value must be greater than low value!");
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 }
 /*
+generate data items with a uniform distribution in [lower, upper] and set
+the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
+{
+    CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
+    if (tensor->devID < 0) {
+        _SetDataRand(tensor, lower, upper);
+        DTYPE * data = (DTYPE*)tensor->data;
+        for (int i = 0; i < tensor->unitNum; i++) {
+            if (data[i] >= p)
+                data[i] = value;
+            else
+                data[i] = 0;
+        }
+    }
+    else {
+#ifdef USE_CUDA
+        _CudaSetDataRandP(tensor, lower, upper, p, value);
+#else
+        ShowNTErrors("Please recompile the code by specifying USE_CUDA");
+#endif // USE_CUDA
+    }
+}
+/*
 generate data items with a normal distribution with specified mean and standard deviation
 >> tensor - the tensor that keeps the data
 >> mean - mean or expectation of the distribution
...
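
_SetDataRandP draws each element from U(lower, upper) and then thresholds it: elements >= p become value, the rest become 0. With lower = 0, upper = 1, p = dropProb and value = 1/(1 - dropProb), every element is kept with probability 1 - dropProb, which is exactly how the reworked Dropout below uses it. A usage sketch under those assumptions:

    /* build an inverted-dropout mask in one pass (sketch);
       E[mask] = (1 - p) * 1/(1 - p) = 1, so no rescaling is needed at test time */
    XTensor mask;
    InitTensor(&mask, &x);                               /* same shape as x */
    DTYPE scale = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    _SetDataRandP(&mask, 0.0F, 1.0F, dropProb, scale);
    XTensor y = Multiply(x, mask);
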
@@ -186,6 +186,26 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
 }
 /*
+set data items to a pre-defined value if its value >= p, set it to 0 otherwise
+>> d - pointer to the data array
+>> size - size of the array
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+__global__
+void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < size) {
+        if (d[i] >= p)
+            d[i] = value;
+        else
+            d[i] = 0;
+    }
+}
+/*
 set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
 >> tensor - the tensor whose data array would be initialized
 >> beg - the beginning position
@@ -437,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
 >> lower - lower value of the range
 >> upper - upper value of the range
 */
-void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
+void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
 {
     CheckNTErrors(upper > lower, "the high value must be greater than low value!");
@@ -452,17 +472,46 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
     int devIDBackup;
     ProtectCudaDev(tensor->devID, devIDBackup);
-    curandGenerator_t gen;
-    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-    curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
+    curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
     curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
-    curandDestroyGenerator(gen);
     DTYPE variance = upper - lower;
+    if(variance != 1.0F || lower != 0){
         if (tensor->dataType == X_FLOAT)
             KernelSetDataRandFloat <<<blocks, threads>>>((float*) tensor->data, tensor->unitNum, lower, variance);
         else if (tensor->dataType == X_DOUBLE)
             KernelSetDataRandDouble <<<blocks, threads>>>((double*)tensor->data, tensor->unitNum, lower, variance);
+    }
+    BacktoCudaDev(tensor->devID, devIDBackup);
+}
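
Two fixes in one here: the per-call curand generator (created, seeded with time(NULL) and destroyed on every _CudaSetDataRand call) is replaced by the per-device generator now created once in XDevice::Init, so repeated calls within the same second no longer risk drawing the same random stream and the setup cost disappears; and the affine rescale kernels are skipped when the requested range is already [0, 1), since curandGenerateUniform produces that range directly.
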
+/*
+generate data items with a uniform distribution in [lower, upper] and set
+the item to a pre-defined value if the item >= p, set the item to 0 otherwise
+>> tensor - the tensor whose data array would be initialized
+>> lower - lower value of the range
+>> upper - upper value of the range
+>> p - the threshold
+>> value - the value we intend to assign to the item
+*/
+void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
+{
+    _CudaSetDataRand(tensor, lower, upper);
+    int gridSize[3];
+    int blockSize[3];
+    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+    int devIDBackup;
+    ProtectCudaDev(tensor->devID, devIDBackup);
+    KernelSetDataPCut <<<blocks, threads>>>((float*)tensor->data, tensor->unitNum, p, value);
     BacktoCudaDev(tensor->devID, devIDBackup);
 }
...
@@ -47,7 +47,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
 void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
 /* generate data items with a uniform distribution in [lower, upper] */
-void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
+void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+/* generate data items with a uniform distribution in [lower, upper] and set
+   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
+void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
 /* set the data with an array of offsets */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
@@ -55,7 +55,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
 void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
 /* generate data items with a uniform distribution in [lower, upper] */
-void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
+void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
+/* generate data items with a uniform distribution in [lower, upper] and set
+   the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
+void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
 /* generate data items with a normal distribution with specified mean and standard deviation */
 void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
@@ -26,6 +26,7 @@
 #include "../core/arithmetic/Multiply.h"
 #include "../core/arithmetic/MultiplyDim.h"
 #include "../core/math/ScaleAndShift.h"
+#include "../core/getandset/SetData.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor
@@ -147,17 +148,21 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
     XTensor mask;
     DTYPE * maskArray = NULL;
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
     if(leadingDim < 0 && leadingDim2 < 0){
-        ShowNTErrors("TODO");
+        XTensor mask;
+        InitTensor(&mask, &x);
+        _SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
+        return Multiply(x, mask);
     }
     else if(leadingDim2 < 0){
         int n = leadingDim;
         CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
-        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
         /* generate a mask tensor with probability p */
         int unitNum = x.dimSize[n];
         maskArray = new DTYPE[unitNum];
@@ -181,8 +186,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
     CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
     CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
     /* generate a mask tensor with probability p */
     int unitNum = x.dimSize[n] * x.dimSize[m];
     maskArray = new DTYPE[unitNum];
...
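
The new leadingDim < 0 path drops elements i.i.d. over the whole tensor instead of failing with "TODO". Each unit survives with probability 1 - dropProb and is scaled by 1/(1 - dropProb), so the expected activation is unchanged: E[mask * x] = (1 - p) * x / (1 - p) = x. A standalone sanity check of that scaling, plain C++ independent of the library:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const float p = 0.3F, scale = 1.0F / (1.0F - p);
        double sum = 0.0;
        const int n = 1000000;
        for (int i = 0; i < n; i++) {
            float u = (float)rand() / RAND_MAX;   /* u ~ U(0,1) */
            sum += (u >= p) ? scale : 0.0F;       /* one mask element */
        }
        printf("mean mask = %.4f (expected 1.0)\n", sum / n);
        return 0;
    }
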