Commit c6f50a22 by xiaotong

load batch of sequences on both language sides

parent 430f0dfc
......@@ -67,17 +67,17 @@ void AttDecoder::InitModel(int argc, char ** argv,
/*
make the decoding network
>> input - the input tensor of the decoder
>> encoderOutput - the output tensor of the encoder
>> inputDec - the input tensor of the decoder
>> outputEnc - the output tensor of the encoder
>> mask - the mask that indicates whether each position is valid
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the decoder
*/
XTensor AttDecoder::Make(XTensor &input, XTensor &encoderOutput, XTensor &mask, bool isTraining)
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
XTensor x;
x = embedder.Make(input);
x = embedder.Make(inputDec);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -106,7 +106,7 @@ XTensor AttDecoder::Make(XTensor &input, XTensor &encoderOutput, XTensor &mask,
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(encoderOutput, x, encoderOutput, mask, isTraining);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
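For reference, the renamed call above passes the encoder output as both keys and values and the decoder state x as the query. Below is a minimal, self-contained C++ sketch of that cross-attention pattern; it does not use the XTensor API, and the lengths, dimension, and values are invented for illustration only.

// Minimal sketch of encoder-decoder (cross) attention, mirroring the argument
// pattern of attentionsEnde[i].Make(outputEnc, x, outputEnc, ...): queries come
// from the decoder state, keys and values from the encoder output.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int lenDec = 2;   /* decoder length (queries), example value */
    const int lenEnc = 3;   /* encoder length (keys/values), example value */
    const int d      = 4;   /* model dimension, example value */

    /* decoder states (queries) and encoder output (keys = values) */
    std::vector<std::vector<float>> q(lenDec, std::vector<float>(d, 0.5F));
    std::vector<std::vector<float>> kv(lenEnc, std::vector<float>(d, 1.0F));
    kv[1][0] = 2.0F;   /* make one encoder position stand out */

    for(int i = 0; i < lenDec; i++){
        /* scaled dot-product scores against every encoder position */
        std::vector<float> score(lenEnc, 0.0F);
        float norm = 0.0F;
        for(int j = 0; j < lenEnc; j++){
            float s = 0.0F;
            for(int k = 0; k < d; k++)
                s += q[i][k] * kv[j][k];
            score[j] = std::exp(s / std::sqrt((float)d));
            norm += score[j];
        }

        /* attention output = softmax(scores) * V */
        for(int k = 0; k < d; k++){
            float o = 0.0F;
            for(int j = 0; j < lenEnc; j++)
                o += (score[j] / norm) * kv[j][k];
            printf("%.3f ", o);
        }
        printf("\n");
    }
    return 0;
}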
......@@ -48,7 +48,7 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the decoding network */
XTensor Make(XTensor &input, XTensor &encoderOutput, XTensor &mask, bool isTraining);
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining);
};
}
......
......@@ -90,13 +90,27 @@ make the encoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool isTraining)
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor &mask, bool isTraining)
{
return encoder.Make(input, mask, isTraining);
}
/*
make the entire network for language modeling (with the output softmax layer)
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
>> mask - the mask for positions that are/are not involved in computation
>> isTraining - indicates whether we are training the model
<< return - decoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, bool isTraining)
{
return decoder.Make(inputDec, outputEnc, mask, isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
......@@ -145,7 +159,7 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, isTraining);
encoding = MakeEncoder(input, mask, isTraining);
outputLayer.Make(encoding, output);
delete[] dims;
......@@ -156,6 +170,43 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
}
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &padding, bool isTraining)
{
XTensor encoding;
XTensor decoding;
XTensor maskEnc;
XTensor maskDec;
/* generate mask to see "previous" words on the decoder side */
int len = inputDec.GetDim(inputDec.order - 2);
int * dims = new int[inputDec.order + 1];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order] = len;
InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent attention to the current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, isTraining);
outputLayer.Make(decoding, output);
delete[] dims;
}
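A standalone sketch (not part of the commit) of the arithmetic behind the two mask calls above, assuming _SetDataLowTri with a shift of 0 treats the diagonal as part of the lower triangle: the lower triangle is first filled with 1e9 and the rest with 0, and the scale-and-shift then yields 0 for visible positions and -1e9 for future positions.

// Illustration of the decoder self-attention mask built in MakeMT:
// fill the lower triangle with 1e9 and everything else with 0, then scale by
// 1.0 and shift by -1e9, giving 0 for allowed positions and -1e9 for the rest.
#include <cstdio>

int main()
{
    const int len = 4;              /* target sequence length (example value) */
    float mask[len][len];

    /* step 1: emulate _SetDataLowTri(&maskDec, 1e9F, 0) */
    for(int i = 0; i < len; i++)
        for(int j = 0; j < len; j++)
            mask[i][j] = (j <= i) ? 1e9F : 0.0F;

    /* step 2: emulate _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F) */
    for(int i = 0; i < len; i++)
        for(int j = 0; j < len; j++)
            mask[i][j] = mask[i][j] * 1.0F + (-1e9F);

    /* rows: decoder positions; 0 = attendable, -1e9 = blocked (future) */
    for(int i = 0; i < len; i++){
        for(int j = 0; j < len; j++)
            printf("%10.1e ", mask[i][j]);
        printf("\n");
    }
    return 0;
}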
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
......
......@@ -69,11 +69,17 @@ public:
void InitModel(int argc, char ** argv);
/* make the encoding network */
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool isTraining);
XTensor MakeEncoder(XTensor &input, XTensor &mask, bool isTraining);
/* make the entire network for langauge modeling (with the output softmax layer) */
/* make the decoding network */
XTensor MakeDecoder(XTensor &inputEnc, XTensor &inputDec, XTensor &mask, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &padding, bool isTraining);
/* get parameter matrices */
void GetParams(XList &list);
......
......@@ -101,6 +101,7 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "d", &d, 512);
LoadParamInt(argc, argv, "nwarmup", &nwarmup, 4000);
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamBool(argc, argv, "sorted", &isLenSorted, false);
LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
LoadParamBool(argc, argv, "adam", &useAdam, false);
......@@ -189,7 +190,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* label smoothed gold standard (if needed) */
XTensor goldSmoothed;
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc, devID, mem)) {
while (LoadBatch(file, true, &batch, &padding, &gold, NULL, vSize, vSizeTgt,
sBatchSize, wBatchSize, isLenSorted, wc, devID, mem))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......@@ -197,7 +200,13 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
XTensor output;
/* make the network */
model->MakeLM(batch, output, padding, true);
if(model->isLM)
model->MakeLM(batch, output, padding, true);
else if(model->isMT)
model->MakeMT(batch, gold, output, padding, true);
else{
ShowNTErrors("Illegal model type!");
}
/* back-propagation for obtaining gradients */
if (labelSmoothingP > 0)
......@@ -222,13 +231,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
/* back-propagation */
net.Backward(output, g, CROSSENTROPY);
/*for(int i = 0; i < net.nodes.count; i++){
XTensor * node = (XTensor*)net.nodes.Get(i);
XLink::ShowNode(stderr, node);
}
exit(0);*/
gradStep += 1;
loss += -prob;
wordCount += wc;
......@@ -335,7 +337,9 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
ClearBuf();
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, 1, vSize, 1, 1, false, wc, devID, mem)){
while(LoadBatch(file, true, &batch, &padding, &gold, seqs, vSize, vSizeTgt,
1, 1, false, wc, devID, mem))
{
CheckNTErrors(batch.order == 3, "wrong tensor order of the sequence batch");
......@@ -343,7 +347,13 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
XTensor output;
/* make the network */
model->MakeLM(batch, output, padding, false);
if(model->isLM)
model->MakeLM(batch, output, padding, false);
else if(model->isMT)
model->MakeMT(batch, gold, output, padding, false);
else{
ShowNTErrors("Illegal model type!");
}
int bSize = batch.GetDim(0);
int length = batch.GetDim(1);
......@@ -532,7 +542,6 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
offset = 0;
for(int i = 0; i < seqCount; i++){
SampleNode &node = nodes[count];
//fprintf(stderr, "%d %d %d\n", node.size, node.id, node.value);
memcpy(buf2 + offset, node.p, sizeof(int) * node.size);
for(int j = 0; j < step; j++){
seqLen2[count + j] = seqLen[node.id + j];
......@@ -562,7 +571,7 @@ void T2TTrainer::ClearBuf()
nextSeq = -1;
}
/*
/*
load a batch of sequences
>> file - the handle to the data file
>> isLM - indicates whether the data is used for training LMs
......@@ -570,8 +579,8 @@ load a batch of sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> step - the step we go over when move to the next sequence
>> vs - vocabulary size
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
......@@ -582,12 +591,47 @@ load a batch of sequences
int T2TTrainer::LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(isLM){
return LoadBatchLM(file, batch, padding, output, seqs,
vsEnc, sBatch, wBatch,
isSorted, wCount, devID, mem);
}
else{
return LoadBatchMT(file, batch, padding, output, seqs,
vsEnc, vsDec, sBatch, wBatch,
isSorted, wCount, devID, mem);
}
}
/*
load a batch of sequences (for LM)
>> file - the handle to the data file
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> vs - vocabulary size
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
*/
int T2TTrainer::LoadBatchLM(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, step);
LoadBuf(file, isSorted, 1);
int seq = MAX(nextSeq, 0);
int wc = 0;
......@@ -614,74 +658,175 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
if(sc <= 0)
return 0;
if(isLM){
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(batch->grad == NULL)
XNoder::MakeGrad(batch);
else
InitTensor(batch->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
if(padding->grad == NULL)
XNoder::MakeGrad(padding);
else
InitTensor2D(padding->grad, sc, max, X_FLOAT, devID, mem);
if(output->grad == NULL)
XNoder::MakeGrad(output);
else
InitTensor(output->grad, 3, dims, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
int dims[MAX_TENSOR_DIM_NUM];
dims[0] = sc;
dims[1] = max;
dims[2] = vs;
InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
InitTensor(output, 3, dims, X_FLOAT, 1.0F, devID, mem);
XNoder::MakeGrad(batch);
XNoder::MakeGrad(padding);
XNoder::MakeGrad(output);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
batch->grad->SetZeroAll();
padding->grad->SetZeroAll();
output->grad->SetZeroAll();
int seqSize = 0;
//fprintf(tf, "batch %d(%d)\n", tc++, sc);
/* this might be slow on GPUs :( */
for(int s = seq; s < seq + sc; s++){
int len = isDoubledEnd ? seqLen[s] : seqLen[s] - 1;
CheckNTErrors(len <= max, "Something is wrong!");
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
padding->Set2D(1.0F, s - seq, w);
if(w > 0)
output->Set3D(1.0F, s - seq, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1){
if(isDoubledEnd)
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
output->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w + 1]);
}
wCount++;
/*fprintf(tf, "%d", buf[seqOffset[s] + w]);
if(w < seqLen[s] - 1)
fprintf(tf, " ");
else
fprintf(tf, "\n");*/
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
if(seqs != NULL){
for(int w = len; w < max; w++)
seqs[seqSize++] = -1;
}
}
fflush(tf);
fflush(tf);
return sc;
}
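The loop above implements next-word prediction: the target at position w-1 is the input word at position w, and the last target is either the final word itself (when isDoubledEnd is set) or the word that follows the truncated sequence. A toy sketch of that indexing, with invented word ids:

// Toy illustration of the LM target construction in LoadBatchLM: the target at
// position w-1 is the input word at position w (next-word prediction).
// The word ids and the isDoubledEnd flag below are made-up example values.
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> seq = {11, 24, 7, 35, 2};   /* one sequence from the buffer */
    bool isDoubledEnd = true;                    /* example setting */

    int len = isDoubledEnd ? (int)seq.size() : (int)seq.size() - 1;
    std::vector<int> input(len), target(len);

    for(int w = 0; w < len; w++){
        input[w] = seq[w];
        if(w > 0)
            target[w - 1] = seq[w];
        if(w == len - 1)
            target[w] = isDoubledEnd ? seq[w] : seq[w + 1];
    }

    for(int w = 0; w < len; w++)
        printf("pos %d: input=%d -> target=%d\n", w, input[w], target[w]);
    return 0;
}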
/*
load a batch of sequences (for MT)
>> file - the handle to the data file
>> batch - the batch of the input sequences
>> padding - padding of the input sequences
>> output - the batch of the output sequences
>> seqs - keep the sequences in an array
>> vsEnc - size of the encoder vocabulary
>> vsDec - size of the decoder vocabulary
>> sBatch - batch size of sequences
>> wBatch - batch size of words
>> isSorted - indicates whether the sequences are sorted by length
>> wCount - word count
>> devID - device id
>> mem - memory pool
*/
int T2TTrainer::LoadBatchMT(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem)
{
if(nextSeq < 0 || nextSeq >= nseqBuf)
LoadBuf(file, isSorted, 2);
int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcDec = 0;
int wnEnc = 0;
int wnDec = 0;
int maxEnc = 0;
int maxDec = 0;
int sc = 0;
CheckNTErrors((nseqBuf - seq) % 2 == 0, "Input sequence must be paired!");
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wcEnc += wnEnc;
sc += 1;
if(maxEnc < wnEnc)
maxEnc = wnEnc;
/* target-side sequence */
wnDec = seqLen[seq + sc];
wcDec += wnDec;
sc += 1;
if(maxDec < wnDec)
maxDec = wnDec;
if(sc >= sBatch * 2 && wcEnc >= wBatch)
break;
}
nextSeq = seq + sc;
if(sc <= 0)
return 0;
int sCount = sc/2;
int seqSize = 0;
int dimsEnc[3] = {sCount, maxEnc, vsEnc};
int dimsDec[3] = {sCount, maxDec, vsDec};
InitTensor(batch, 3, dimsEnc, X_FLOAT, 1.0F, devID, mem);
InitTensor2D(padding, sCount, maxDec, X_FLOAT, devID, mem);
InitTensor(output, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
batch->SetZeroAll();
padding->SetZeroAll();
output->SetZeroAll();
wCount = 0;
/* batch of the source-side sequences */
for(int s = seq; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq)/2;
for(int w = 0; w < len; w++){
batch->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
wCount++;
}
}
/* batch of the target-side sequences */
for(int s = seq + 1; s < seq + sc; s += 2){
int len = seqLen[s];
int sent = (s - seq - 1)/2;
for(int w = 0; w < len; w++){
padding->Set2D(1.0F, sent, w);
if(w > 0)
output->Set3D(1.0F, sent, w - 1, buf[seqOffset[s] + w]);
if(w == len - 1)
output->Set3D(1.0F, sent, w, buf[seqOffset[s] + w]);
wCount++;
if(seqs != NULL)
seqs[seqSize++] = buf[seqOffset[s] + w];
}
if(seqs != NULL){
for(int w = len; w < maxDec; w++)
seqs[seqSize++] = -1;
}
}
return sc;
......
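LoadBatchMT assumes the buffer stores source and target sequences interleaved, source at even positions and target at odd positions, which is why the loops above advance in steps of 2 and map sequence s to sentence row (s - seq)/2. A small standalone sketch of that pairing, with invented lengths:

// Sketch of the interleaved source/target layout assumed by LoadBatchMT:
// sequences come in (source, target) pairs, so the loader walks the buffer in
// steps of 2 and maps the pair index back to a sentence row in the batch.
// The lengths below are invented example values.
#include <cstdio>

int main()
{
    /* lengths of buffered sequences: src0, tgt0, src1, tgt1, src2, tgt2 */
    int seqLen[] = {5, 6, 3, 4, 7, 7};
    int nseq = 6;

    int maxEnc = 0, maxDec = 0, wcEnc = 0, wcDec = 0;
    for(int s = 0; s < nseq; s += 2){
        int srcLen = seqLen[s];
        int tgtLen = seqLen[s + 1];
        wcEnc += srcLen;
        wcDec += tgtLen;
        if(srcLen > maxEnc) maxEnc = srcLen;
        if(tgtLen > maxDec) maxDec = tgtLen;

        /* sentence row this pair occupies in the batch tensors */
        int sent = s / 2;
        printf("pair %d: source len %d, target len %d\n", sent, srcLen, tgtLen);
    }

    int sCount = nseq / 2;
    printf("batch: %d sentence pairs, encoder %d x %d, decoder %d x %d\n",
           sCount, sCount, maxEnc, sCount, maxDec);
    printf("encoder words: %d, decoder words: %d\n", wcEnc, wcDec);
    return 0;
}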
......@@ -79,6 +79,9 @@ public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* learning rate */
float lrate;
......@@ -160,10 +163,24 @@ public:
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
......