better implementation of dropout

52a27964 · xiaotong · a8304bed · 52a27964 · 52a27964 · 52a27964
Commit 52a27964 authored Dec 28, 2018 by xiaotong
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradSum(node, isEfficient);
    else if(operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
+    else if(operID == MATH_SUMBROADCAST)
+        GradSumBroadcast(node, isEfficient);
    else if(operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
    else if(operID == REDUCE_REDUCESUM)
@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 }

 /*
+gradient for multiplication by broadcasting: 
+c = a * b
+where some dimensions of b are of size 1
+
+dE/da = dE/dc * b
+dE/db = (dE/dc * a).reduce(0...n)
+where x.reduce(0...n) is the reduction (sum) of x along every dimension
+whose size is 1 in b. Note that there might be several reductions.
+
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner
+*/
+void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    DTYPE beta = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
+
+    if(b->isVar || b->income.tailNum > 0){
+        ShowNTErrors("TODO");
+    }
+}
+
+/*
 gradient for negate
 for
 c = -a
@@ -1253,6 +1286,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
    node->visitMark = NODE_FINISHED;
 }

+/* 
+gradient for sum by broadcasting: 
+c = a + b * \beta
+where some dimensions of b are of size 1
+
+dE/da = dE/dc
+dE/db = dE/dc.reduce(0..n) * \beta
+where dE/dc.reduce(0..n) is the reduction (sum) of dE/dc along the dimensions
+whose size is 1 in b
+
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner
+*/
+void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");
+
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    DTYPE beta = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+    _Sum(a->grad, node->grad, a->grad);
+
+    if(b->isVar || b->income.tailNum > 0){
+        ShowNTErrors("TODO");
+    }
+}
+
 /*
 gradient for reduceMean
 for

--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -109,6 +109,11 @@ private:
    static
    void GradMultiplyDim(XTensor * node, bool isEfficient);

+    /* gradient for multiplication by broadcasting: c = a * b
+       where some dimensions of b are of size 1 */
+    static
+    void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
+
    /* gradient for negate */
    static
    void GradNegate(XTensor * node, bool isEfficient);
@@ -143,6 +148,11 @@ private:
    static
    void GradSumDim(XTensor * node, bool isEfficient);

+    /* gradient for sum by broadcasting: c = a + b * \beta
+       where some dimensions of b are of size 1 */
+    static
+    void GradSumBroadcast(XTensor * node, bool isEfficient);
+
    /* gradient for reduceMean */
    static
    void GradReduceMean(XTensor * node, bool isEfficient);

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X

    /* dropout */
    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, 2);

    for(int i = 0; i < nlayer; i++){
        XTensor att;
@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X

        /* dropout */
        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, 2);

        /* residual connection */
        res = Sum(att, x);
@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X

        /* dropout */
        if(isTraining && dropoutP > 0)
-            ende = Dropout(ende, dropoutP);
+            ende = Dropout(ende, dropoutP, 2);

        /* residual connection */
        res = Sum(ende, x);
@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X

        /* dropout */
        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, 2);

        /* residual connection */
        res = Sum(fnn, x);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo

    /* dropout */
    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, 2);

    for(int i = 0; i < nlayer; i++){
        XTensor att;
@@ -117,10 +117,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo

        /* self attention */
        att = attentions[i].Make(x, x, x, mask, isTraining);
-
+        
        /* dropout */
        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, 2);

        /* residual connection */
        res = Sum(att, x);
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo

        /* dropout */
        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, 2);

        /* residual connection */
        res = Sum(fnn, x);

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    _Sum(&maskEnc, padding3, &maskEnc);

    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
+
    decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
+
    outputLayer->Make(decoding, output);

    delete[] dims;

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -231,7 +231,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
            DTYPE lossLocal = -prob / wc;
            bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);

-            XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
+            XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;   

            if (doUpdate) {
                

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "output", outputFN, "");

    srand((unsigned int)time(NULL));
+
    T2TTrainer trainer;
    trainer.Init(argc, args);


--- a/source/tensor/function/Dropout.cpp
+++ b/source/tensor/function/Dropout.cpp
@@ -39,7 +39,7 @@ for more details.

 Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
 to mark the tensor with probability p in the inference phase. Instead we perform
-the same inference procedure as that with no use of dropout on the test data.
+the same inference procedure on the test data as that with no use of dropout.
 
 >> x - input tensor
 >> y - output tensor
@@ -122,8 +122,8 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
    else
        ShowNTErrors("TODO!");
 }
-    
-/*
+
+/* 
 dropout function (we make tensor connections here)
 It randomly zeroes some of the elements of the input tensor
 with probability p via a Bernoulli distribution.
@@ -134,36 +134,79 @@ for more details.
 Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
 to mark the tensor with probability p in the inference phase. Instead we perform
 the same inference procedure as that with no use of dropout on the test data.
- 
+
 >> x - input tensor
 >> dropProb - probability to set an element to zero
 >> leadingDim - the dimension which we generate the random numbers and perform broadcasting
+>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
+<< return - tensor after dropout
 */
-XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
 {
    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

-    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
+    XTensor mask;
+    DTYPE * maskArray = NULL;

-    CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
+    if(leadingDim < 0 && leadingDim2 < 0){
+        ShowNTErrors("TODO");
+    }
+    else if(leadingDim2 < 0){
+        int n = leadingDim;

-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
+        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
+
+        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    
-    /* generate a mask tensor with probability p */
-    int unitNum = x.dimSize[n];
-    DTYPE * maskArray = new DTYPE[unitNum];
+        /* generate a mask tensor with probability p */
+        int unitNum = x.dimSize[n];
+        maskArray = new DTYPE[unitNum];

-    //srand((unsigned int)time(NULL));
-    for (int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
+        //srand((unsigned int)time(NULL));
+        for (int i = 0; i < unitNum; i++)
+            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
    
-    XTensor mask;
-    InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
-    mask.SetData(maskArray, unitNum);
+        XTensor mask;
+        InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
+        mask.SetData(maskArray, unitNum);
+
+        delete[] maskArray;
+    
+        return MultiplyDim(x, mask, n);
+    }
+    else{
+        int n = leadingDim;
+        int m = leadingDim2;
+
+        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
+        CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
+
+        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
+    
+        /* generate a mask tensor with probability p */
+        int unitNum = x.dimSize[n] * x.dimSize[m];
+        maskArray = new DTYPE[unitNum];
+
+        //srand((unsigned int)time(NULL));
+        for (int i = 0; i < unitNum; i++)
+            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
+
+        int dims[MAX_TENSOR_DIM_NUM];
+
+        for(int i = 0; i < x.order; i++)
+            dims[i] = 1;
+        dims[n] = x.GetDim(n);
+        dims[m] = x.GetDim(m);
+    
+        InitTensor(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
+        mask.SetData(maskArray, unitNum);
+
+        delete[] maskArray;
+    
+        return MultiplyBroadcast(x, mask);
+    }

-    delete[] maskArray;
    
-    return MultiplyDim(x, mask, n, 0);
 }

 /* 
@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];

-    srand((unsigned int)time(NULL));
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
    

--- a/source/tensor/function/Dropout.h
+++ b/source/tensor/function/Dropout.h
@@ -39,9 +39,9 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
 void _DropoutBackward(const XTensor * y, const XTensor * x, 
                      const XTensor * dedy, XTensor * dedx, 
                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);
-    
+
 /* dropout function */
-XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
    
 /* dropout function without broadcast */
 XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);