Commit e1630c28 by xiaotong

padding for batch training of t2t

parent 6793e025
@@ -39,7 +39,6 @@ void SumDimTest();
 using namespace nts;
 using namespace fnnlm;
 using namespace transformer;
-using namespace GAN;
 
 int main( int argc, const char ** argv )
 {
@@ -47,9 +46,7 @@ int main( int argc, const char ** argv )
     //BackwardTest();
     //return 0;
 
-    if(argc > 1 && !strcmp(argv[1], "-test"))
-        Test();
-    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
         FNNLMMain(argc - 1, argv + 1);
     else if(argc > 1 && !strcmp(argv[1], "-t2t"))
         TransformerMain(argc - 1, argv + 1);
...
@@ -103,6 +103,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         XTensor fnn;
         XTensor res;
 
+        /* we skip the residual connection for the first layer if
+           the encoder is used in language modeling */
         if(skipInputRes && i == 0){
             /* self attention */
             att = attentions[i].Make(x, x, x, mask);
...
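To read the branch above at a glance: for the first layer of a language model the input is the raw embedding, so the residual sum is skipped and the attention output is used on its own, while every other layer keeps the usual x + att(x). A minimal sketch in plain C++ (illustrative only, not the NiuTensor tensor API):

#include <vector>

// Layer output with an optional skipped residual: att alone for the first
// LM layer, x + att everywhere else (element-wise sum as a stand-in for
// the tensor Sum in the real code).
std::vector<float> EncoderLayerOutput(const std::vector<float> &x,
                                      const std::vector<float> &att,
                                      bool skipInputRes, int layer)
{
    if (skipInputRes && layer == 0)
        return att;                     /* no residual connection here */

    std::vector<float> res(x.size());
    for (size_t k = 0; k < x.size(); k++)
        res[k] = x[k] + att[k];         /* standard residual connection */
    return res;
}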
@@ -60,7 +60,7 @@ void T2TModel::InitModel(int argc, const char ** argv)
 
     if(useMem){
         delete mem;
-        mem = new XMem(devID);
+        mem = new XMem(devID, UNI_FREE, MILLION * 512, 1024, MILLION * 128);
     }
 
     encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
@@ -98,7 +98,9 @@ void T2TModel::Make(XTensor &input, XTensor &output)
         dims[input.order] = len;
         XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
 
-        /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9 */
+        /* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
+           this matrix can be used to prevent attention to the current or following words in
+           a given sequence. */
         _SetDataLowTri(&mask, 1e9F, -1);
         _ScaleAndShiftMe(&mask, 1.0F, -1e9F);
...
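The arithmetic behind these two calls is easy to check by hand, assuming "lower triangle with shift -1" means the cells with column <= row - 1 are written: the first call puts 1e9 at the visible positions and 0 elsewhere, and the scale-and-shift maps 1e9 to 0 and 0 to -1e9. Adding the mask to the pre-softmax attention scores then leaves permitted positions untouched and drives forbidden ones to -1e9. A runnable plain-C++ sketch of the same computation:

#include <cstdio>

int main()
{
    const int n = 4;
    float mask[n][n];

    /* _SetDataLowTri(&mask, 1e9F, -1): 1e9 strictly below the diagonal */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            mask[i][j] = (j <= i - 1) ? 1e9F : 0.0F;

    /* _ScaleAndShiftMe(&mask, 1.0F, -1e9F): 1e9 -> 0, 0 -> -1e9 */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            mask[i][j] = mask[i][j] * 1.0F - 1e9F;

    /* prints rows with 0 below the diagonal, -1e9 on and above it */
    for (int i = 0; i < n; i++, printf("\n"))
        for (int j = 0; j < n; j++)
            printf("% .0e ", mask[i][j]);

    return 0;
}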
@@ -53,6 +53,9 @@ initialization
 */
 void T2TTrainer::Init(int argc, const char ** argv)
 {
+    bool useMem = false;
+
+    LoadParamBool(argc, argv, "mem", &useMem, useMem);
     LoadParamInt(argc, argv, "dev", &devID, -1);
     LoadParamFloat(argc, argv, "lrate", &lrate, 0.001F);
     LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
@@ -68,6 +71,11 @@ void T2TTrainer::Init(int argc, const char ** argv)
     buf = new int[bufSize];
     seqLen = new int[bufSize];
     seqOffset = new int[bufSize];
+
+    if(useMem){
+        delete mem;
+        mem = new XMem(devID, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
+    }
 }
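The new "mem" switch is read with the same LoadParam* helpers as the numeric options and, when set, replaces the trainer's memory pool with a preallocated one. As an illustration of the calling pattern only (a hypothetical re-implementation, not the actual T2TUtility code), a boolean flag parser of this shape can be very small:

#include <cstring>

/* hypothetical sketch of the LoadParamBool pattern: a bare "-mem" flag
   flips the boolean, otherwise the default value is kept */
void LoadParamBoolSketch(int argc, const char ** argv, const char * name,
                         bool * p, bool defaultP)
{
    *p = defaultP;
    for (int i = 0; i < argc; i++)
        if (argv[i][0] == '-' && !strcmp(argv[i] + 1, name))
            *p = true;
}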
...
@@ -86,6 +94,9 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     float loss = 0;
     float lr = 0;
 
+    model->mem->SetPin();
+    mem->SetPin();
+
     XNet net;
 
     double startT = GetClockSec();
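SetPin and BackToPin read like a mark-and-rewind memory pool: SetPin records the top of the pool once before training, and each BackToPin below rolls allocation back to that mark, so the tensors created for one batch recycle the same memory instead of growing the pool for the whole run. A minimal arena sketch under that assumption (not XMem's actual implementation):

#include <cstddef>
#include <cstdlib>

struct Arena {
    char * base;
    size_t cap, top = 0, pin = 0;

    explicit Arena(size_t c) : base((char*)std::malloc(c)), cap(c) {}
    ~Arena() { std::free(base); }

    void * Alloc(size_t n) {            /* bump allocation */
        if (top + n > cap) return nullptr;
        void * p = base + top;
        top += n;
        return p;
    }
    void SetPin()    { pin = top; }     /* remember where batch data starts */
    void BackToPin() { top = pin; }     /* drop all batch data in O(1) */
};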
@@ -96,11 +107,17 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     CheckNTErrors(file, "cannot open training file!");
 
     wordCount = 0;
 
+    model->mem->BackToPin();
+    mem->BackToPin();
+
     /* batch of input sequences */
     XTensor batch;
 
+    /* padding */
+    XTensor padding;
+
-    while(LoadBatch(file, &batch, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc)){
+    while(LoadBatch(file, &batch, &padding, 1, vSize, sBatchSize, wBatchSize, isLenSorted, wc)){
 
         /* output probabilities */
         XTensor output;
@@ -108,6 +125,10 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
         /* make the network */
         model->Make(batch, output);
 
+        /* make paddings for the output */
+        if(output.GetDim(0) > 1)
+            PadOutput(&output, &padding);
+
         /* back-propagation for obtaining gradients */
         net.Backward(output, batch, CROSSENTROPY);
@@ -135,6 +156,9 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
             XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
                     lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss / wordCount));
         }
+
+        model->mem->BackToPin();
+        mem->BackToPin();
     }
 
     fclose(file);
@@ -230,6 +254,7 @@ int T2TTrainer::LoadBuf(FILE * file)
 load a batch of sequences
 >> file - the handle to the data file
 >> batch - the batch
+>> padding - padding of the input sequences
 >> step - the step we go over when moving to the next sequence
 >> vs - vocabulary size
 >> sBatch - batch size of sequences
@@ -237,7 +262,9 @@ load a batch of sequences
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 */
-int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount)
+int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, XTensor * padding,
+                          int step, int vs, int sBatch, int wBatch,
+                          bool isSorted, int &wCount)
 {
     if(nextSeq < 0 || nextSeq >= nseqBuf)
         LoadBuf(file);
...
@@ -273,12 +300,19 @@ int T2TTrainer::LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sB
         InitTensor(batch, 3, dims, X_FLOAT, 1.0F, devID, mem);
     }
 
+    if(padding->order != 2 || padding->GetDim(0) != sc ||
+       padding->GetDim(1) != max){
+        InitTensor2D(padding, sc, max, X_FLOAT, devID, mem);
+    }
+
     batch->SetZeroAll();
+    padding->SetZeroAll();
 
     /* this might be slow on GPUs :( */
     for(int s = seq; s < seq + sc; s++){
         for(int w = 0; w < seqLen[s]; w++){
             batch->Set3D(1.0F, s - seq, w, buf[seqOffset[s] + w]);
+            padding->Set2D(1.0F, s - seq, w);
             wCount++;
         }
     }
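After this loop, batch holds one-hot word indicators of shape (sequence, position, vocabulary) and padding holds a (sequence, position) matrix with 1.0 at real words and 0.0 in the padded tail. A self-contained sketch of the same fill, with two sequences of lengths 3 and 1 and a hypothetical vocabulary of 5 words:

#include <vector>

int main()
{
    const int sc = 2, max = 3, vs = 5;              /* sequences, max length, vocab */
    int seqLen[sc]     = {3, 1};
    int words[sc][max] = {{4, 2, 0}, {3, 0, 0}};    /* word ids, right-padded */

    std::vector<float> batch(sc * max * vs, 0.0F);  /* zeroed like SetZeroAll() */
    std::vector<float> padding(sc * max, 0.0F);

    for (int s = 0; s < sc; s++) {
        for (int w = 0; w < seqLen[s]; w++) {
            batch[(s * max + w) * vs + words[s][w]] = 1.0F; /* Set3D(1.0F, s, w, id) */
            padding[s * max + w] = 1.0F;                    /* Set2D(1.0F, s, w)     */
        }
    }
    return 0;
}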
@@ -394,4 +428,35 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
     }
 }
 
+/*
+do padding on the output
+>> output - output tensor of the network
+>> padding - padding of a batch of sentences
+*/
+void T2TTrainer::PadOutput(XTensor * output, XTensor * padding)
+{
+    if(output == NULL || padding == NULL)
+        return;
+
+    int on = output->order;
+    int * dimso = new int[on];
+
+    memcpy(dimso, output->dimSize, sizeof(int) * on);
+
+    output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
+
+    XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
+
+    _CopyValues(padding, padding2);
+    _ScaleAndShiftMe(padding2, 1e9F, -1e9F);
+    _SumDim(output, padding2, output, 0);
+
+    output->Reshape(on, dimso);
+
+    delete[] dimso;
+    DelTensorBuf(padding2);
+}
 
 }
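The masking step in PadOutput is plain arithmetic: _ScaleAndShiftMe turns the copied padding values into padding * 1e9 - 1e9, i.e. 0 for real positions and -1e9 for padded ones, and _SumDim broadcasts that vector down dimension 0 of the flattened (positions x vocabulary) output, pushing every score in a padded row to roughly -1e9 while leaving real rows unchanged. The same computation written with ordinary loops:

#include <vector>

int main()
{
    const int rows = 4, vocab = 3;
    float padding[rows] = {1.0F, 1.0F, 0.0F, 0.0F};     /* last two rows are pads */
    std::vector<float> output(rows * vocab, -1.2F);     /* pretend log-probabilities */

    for (int r = 0; r < rows; r++) {
        /* _ScaleAndShiftMe(padding2, 1e9F, -1e9F): 1 -> 0, 0 -> -1e9 */
        float shift = padding[r] * 1e9F - 1e9F;

        /* _SumDim(output, padding2, output, 0): add shift to the whole row */
        for (int v = 0; v < vocab; v++)
            output[r * vocab + v] += shift;
    }
    return 0;
}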
@@ -105,13 +105,18 @@ public:
     int LoadBuf(FILE * file);
 
     /* load a batch of sequences */
-    int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
+    int LoadBatch(FILE * file, XTensor * batch, XTensor * padding,
+                  int step, int vs, int sBatch, int wBatch,
+                  bool isSorted, int &wCount);
 
     /* get word probabilities for a batch of sequences */
     float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
 
     /* update the model by delta rule */
     void Update(T2TModel * model, const float lr);
 
+    /* do padding on the output */
+    void PadOutput(XTensor * output, XTensor * padding);
 };
...
@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
     if (vectorSize % 32 != 0) minWarpNum++;
     warpNum = min(warpNum, minWarpNum);
 
-    grid.x = vectorNum;
+    grid.x = (unsigned int)vectorNum;
     grid.y = 1;
     grid.z = 1;
     block.x = 1;
...
@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
     if (vectorSize % 32 != 0) minWarpNum++;
     warpNum = min(warpNum, minWarpNum);
 
-    grid.x = vectorNum;
+    grid.x = (unsigned int)vectorNum;
    grid.y = 1;
     grid.z = 1;
     block.x = 1;
...
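The casts are needed because the fields of CUDA's dim3 are unsigned int while vectorNum is long long, so the plain assignment narrows implicitly and draws compiler warnings. The cast still truncates silently if vectorNum ever exceeded UINT_MAX; a guard like this hypothetical helper (not part of the patch) would make that assumption explicit:

#include <cassert>
#include <climits>

/* hypothetical checked narrowing for grid dimensions */
inline unsigned int ToGridDim(long long vectorNum)
{
    assert(vectorNum > 0 && vectorNum <= (long long)UINT_MAX);
    return (unsigned int)vectorNum;
}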