NiuTrans.Tensor
Commit cecbceb9, authored Sep 16, 2018 by xiaotong
add dropout to transformer
parent 2e20824a
Showing 10 changed files with 64 additions and 64 deletions.
source/sample/transformer/T2TAttention.cpp    +0   -9
source/sample/transformer/T2TAttention.h      +3   -0
source/sample/transformer/T2TEncoder.cpp      +17  -6
source/sample/transformer/T2TEncoder.h        +6   -3
source/sample/transformer/T2TModel.cpp        +6   -4
source/sample/transformer/T2TModel.h          +2   -2
source/sample/transformer/T2TTrainer.cpp      +2   -2
source/tensor/function/Dropout.cpp            +21  -38
source/tensor/function/Dropout.h              +6   -0
source/tensor/function/FHeader.h              +1   -0
source/sample/transformer/T2TAttention.cpp
@@ -125,17 +125,8 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
     dot = Linear(dot, 1.0F/(float)sqrt((float)dk));

-    //if(llnum == 1)
-    //    dot.Dump(tf, "dot:");
-
     scalar = Softmax(dot, -1);

-    //if(llnum == 1)
-    //    scalar.Dump(tf, "scalar:");
-    //if(ignored > 0)
-    //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
-
     att = BMMul(scalar, vheads);

     /* concatenate the heads */
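For context, the lines that survive this hunk implement standard scaled dot-product attention; dot holds the batched QK^T computed a few lines above (not shown in this hunk). With queries Q, keys K, values V and per-head dimension d_k, the three statements compute, per head:

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V
\]

Linear(dot, 1.0F/(float)sqrt((float)dk)) applies the 1/sqrt(d_k) scaling, Softmax(dot, -1) normalizes over the last dimension, and BMMul(scalar, vheads) is the batched product with the value heads.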
source/sample/transformer/T2TAttention.h
@@ -73,6 +73,9 @@ public:
        special design for the attention model. */
     int ignored;

+    /* indicates whether the model is used for training */
+    bool isTraining;
+
 public:

     /* constructor */
     T2TAttention();
source/sample/transformer/T2TEncoder.cpp
@@ -63,6 +63,7 @@ void AttEncoder::InitModel(int argc, const char ** argv,
     LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
     LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
     CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

@@ -89,9 +90,10 @@ make the encoding network
 >> input - the input tensor of the encoder
 >> mask - the mask that indicates whether each position is valid
 >> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether the model is for training
 << return - the output tensor of the encoder
 */
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
 {
     XTensor x;

@@ -111,7 +113,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         /* self attention */
         att = attentions[i].Make(x, x, x, mask);

-        /* TODO: dropout */
+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            att = Dropout(att);

         /* layer normalization */
         x = attLayerNorms[i].Make(att);

@@ -121,10 +125,12 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         /* self attention */
         att = attentions[i].Make(x, x, x, mask);

+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            att = Dropout(att);
+
         /* residual connection */
         res = Sum(att, x);

-        /* TODO: dropout */
-
         /* layer normalization */
         x = attLayerNorms[i].Make(res);

@@ -133,13 +139,18 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
         /* fnn */
         fnn = fnns[i].Make(x);

+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            fnn = Dropout(fnn);
+
         /* residual connection */
         res = Sum(fnn, x);

-        /* TODO: dropout */
-
         /* layer normalization */
         x = fnnLayerNorms[i].Make(res);
+
+        if(isTraining && dropoutP > 0)
+            x = Dropout(x);
     }

     return x;
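The net effect of these hunks: every sublayer output (self-attention and FNN) is dropped out before its residual connection and layer normalization, and the normalized output of each layer gets one more dropout at the end, all gated on isTraining && dropoutP > 0. A minimal structural sketch of that control flow, showing the general (non-skipInputRes) branch; the types and helpers are hypothetical stand-ins, not the NiuTrans.Tensor API:

/* sketch: the per-layer dropout pattern introduced by this commit */
struct Tensor {};                                         // stand-in type

Tensor SelfAttention(const Tensor &x) { return x; }       // stand-in
Tensor FNN(const Tensor &x)           { return x; }       // stand-in
Tensor LayerNorm(const Tensor &x)     { return x; }       // stand-in
Tensor Add(const Tensor &a, const Tensor &) { return a; } // stand-in
Tensor ApplyDropout(const Tensor &x)  { return x; }       // stand-in

Tensor EncoderLayer(Tensor x, bool isTraining, float dropoutP)
{
    /* self-attention sublayer: attention -> dropout -> residual -> norm */
    Tensor att = SelfAttention(x);
    if(isTraining && dropoutP > 0)
        att = ApplyDropout(att);
    x = LayerNorm(Add(att, x));

    /* feed-forward sublayer: fnn -> dropout -> residual -> norm */
    Tensor fnn = FNN(x);
    if(isTraining && dropoutP > 0)
        fnn = ApplyDropout(fnn);
    x = LayerNorm(Add(fnn, x));

    /* the commit also drops out the normalized layer output itself */
    if(isTraining && dropoutP > 0)
        x = ApplyDropout(x);
    return x;
}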
source/sample/transformer/T2TEncoder.h
@@ -40,7 +40,7 @@ class T2TEncoder
 {
 public:
-    virtual XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
+    virtual XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining) = 0;
 };

@@ -49,7 +49,7 @@ the encoder based on RNN
 class RNNEncoder : T2TEncoder
 {
 public:
-    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
 };

@@ -77,6 +77,9 @@ public:
     /* vocabulary size */
     int vSize;

+    /* dropout probability */
+    DTYPE dropoutP;
+
     /* some positions can be ignored in attention. this is useful in lm where the first position needs
        special design for the attention model. */
     int ignored;

@@ -115,7 +118,7 @@ public:
                   int myDevID = -1, XMem * myMem = NULL);

     /* make the encoding network */
-    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
 };
source/sample/transformer/T2TModel.cpp
@@ -77,11 +77,12 @@ make the encoding network
 >> input - input tensor
 >> mask - the mask for positions that are / are not involved in the computation
 >> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
 {
-    return encoder.Make(input, mask, skipInputRes);
+    return encoder.Make(input, mask, skipInputRes, isTraining);
 }

 /*

@@ -89,8 +90,9 @@ make the entire network (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
+>> isTraining - indicates whether the model is for training
 */
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
+void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
     XTensor encoding;

@@ -134,7 +136,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
     //_Sum(&mask, padding3, &mask);

-    encoding = MakeEncoding(input, mask, true);
+    encoding = MakeEncoding(input, mask, true, isTraining);
     outputLayer.Make(encoding, output);

     delete[] dims;
source/sample/transformer/T2TModel.h
@@ -69,10 +69,10 @@ public:
     void InitModel(int argc, const char ** argv);

     /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);

     /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding);
+    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);

     /* get parameter matrices */
     void GetParams(XList &list);
source/sample/transformer/T2TTrainer.cpp
@@ -149,7 +149,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     XTensor output;

     /* make the network */
-    model->Make(batch, output, padding);
+    model->Make(batch, output, padding, true);

     /* make paddings for the output */
     if(output.GetDim(0) > 1)

@@ -271,7 +271,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
     XTensor output;

     /* make the network */
-    model->Make(batch, output, padding);
+    model->Make(batch, output, padding, false);

     int bSize = batch.GetDim(0);
     int length = batch.GetDim(1);
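With the flag threaded through Make, the trainer is now the single place that fixes the mode: Train passes true so dropout fires, Test passes false so inference stays deterministic. A sketch of the resulting calling convention; the wrapper functions are hypothetical and exist only to make the pattern explicit:

/* hypothetical wrappers illustrating the convention above */
void RunTrainStep(T2TModel * model, XTensor &batch, XTensor &output, XTensor &padding)
{
    model->Make(batch, output, padding, true);   /* isTraining = true: dropout on */
}

void RunTestStep(T2TModel * model, XTensor &batch, XTensor &output, XTensor &padding)
{
    model->Make(batch, output, padding, false);  /* isTraining = false: dropout off */
}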
source/tensor/function/Dropout.cpp
@@ -30,14 +30,6 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

-/*
-generate a random bernoulli number
-*/
-DTYPE RandomBernoulli(DTYPE prob)
-{
-    return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
-}
-
 /*
 dropout function
 It randomly zeroes some of the elements of the input tensor
 with probability p via a Bernoulli distribution.

@@ -64,7 +56,7 @@ void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
     int unitNum = x->unitNum;
     DTYPE * maskArray = new DTYPE[unitNum];
     for(int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
+        maskArray[i] = RandomBernoulli(prob, 1.0F);

     XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
     maskTensor->SetData(maskArray, unitNum);

@@ -112,7 +104,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
     srand(seed);
     DTYPE * maskArray = new DTYPE[unitNum];
     for(int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
+        maskArray[i] = RandomBernoulli(prob, 1.0F);

     XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
     maskTensor->SetData(maskArray, unitNum);

@@ -142,48 +134,39 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
 }

 /*
 dropout function (we make tensor connections here)
 It randomly zeroes some of the elements of the input tensor
 with probability p via a Bernoulli distribution.

 See "Improving neural networks by preventing co-adaptation of feature detectors"
 for more details.

 Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
 to mask the tensor with probability p in the inference phase. Instead we perform
 the same inference procedure as that with no use of dropout on the test data.

 >> x - input tensor
 >> y - output tensor
 >> prob - probability to set an element to zero
 */
 XTensor Dropout(const XTensor &x, DTYPE prob)
 {
     DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);

-    /* generate a mask tensor again with special probability */
-    srand((unsigned int)time(NULL));
+    /* generate a mask tensor with probability p */
     int unitNum = x.unitNum;
     DTYPE * maskArray = new DTYPE[unitNum];
+
+    srand((unsigned int)time(NULL));
     for(int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
+        maskArray[i] = RandomBernoulli(prob, scaleFactor);

-    XTensor maskTensor(&x);
-    maskTensor.SetData(maskArray, unitNum);
-
-    XTensor y;
-    XTensor inter;
-    inter = Multiply(x, maskTensor);
-    y = ScaleAndShift(inter, scaleFactor, 0);
+    XTensor mask(&x);
+    mask.SetData(maskArray, unitNum);

     delete[] maskArray;

-    ///* tensor connection */
-    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
-    //XLink::AddParamToHead(&y, prob);
-
-    return y;
+    return Multiply(x, mask);
 }

 } // namespace nts(NiuTrans.Tensor)
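The rewrite above folds the 1/(1-p) scale directly into the mask, so each mask entry is either 1/(1-p) (keep) or 0 (drop), and the former Multiply + ScaleAndShift pair collapses into a single Multiply. A self-contained sketch in plain C++ (hypothetical names, independent of the NiuTrans.Tensor API) showing why this "inverted dropout" leaves the expected activation unchanged:

#include <cstdio>
#include <cstdlib>
#include <ctime>

/* same rule as RandomBernoulli(prob, scaleFactor) above: return the scale
   with probability 1-p, zero otherwise */
static float BernoulliMask(float p, float scale)
{
    return (float)rand() / (float)RAND_MAX > p ? scale : 0.0f;
}

int main()
{
    const int   n     = 1000000;
    const float p     = 0.3f;               /* probability of dropping */
    const float scale = 1.0f / (1.0f - p);  /* the scaleFactor in Dropout() */

    srand((unsigned int)time(NULL));

    /* drop a constant activation x = 1 repeatedly and average: since
       E[mask] = (1-p) * scale + p * 0 = 1, the mean stays near 1.0 and
       no rescaling is needed at inference time */
    double sum = 0.0;
    for(int i = 0; i < n; i++)
        sum += BernoulliMask(p, scale) * 1.0;

    printf("mean after inverted dropout: %.4f (expect ~1.0)\n", sum / n);
    return 0;
}

One caveat visible in the source itself: Dropout() reseeds with srand((unsigned int)time(NULL)) on every call, and time() has one-second resolution, so repeated calls within the same second draw identical mask patterns.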
source/tensor/function/Dropout.h
@@ -27,6 +27,12 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

+/* generate a random bernoulli number */
+inline DTYPE RandomBernoulli(DTYPE prob, DTYPE value)
+{
+    return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)value : (DTYPE)0.0;
+}
+
 /* dropout function */
 void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob = 0.5);
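Why RandomBernoulli gained a second argument: it returns value with probability 1 - prob and 0 otherwise, so the caller chooses the "keep" magnitude. With value = 1/(1-p), as in Dropout() above, the mask m_i is unbiased:

\[
\mathbb{E}[m_i] = (1-p)\cdot\frac{1}{1-p} + p\cdot 0 = 1
\qquad\Longrightarrow\qquad
\mathbb{E}[m_i\,x_i] = x_i ,
\]

while _Dropout and _DropoutBackward pass value = 1.0F, keeping their masks as plain 0/1 indicators.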
source/tensor/function/FHeader.h
@@ -26,6 +26,7 @@
 #include "../XTensor.h"

+#include "Dropout.h"
 #include "HardTanH.h"
 #include "Identity.h"
 #include "LogSoftmax.h"