better implementation of dropout

52a27964 · xiaotong · a8304bed · 52a27964 · 52a27964 · 52a27964
Commit 52a27964 authored Dec 28, 2018 by xiaotong
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradSum(node, isEfficient);
    else if(operID == MATH_SUMDIM)
        GradSumDim(node, isEfficient);
+    else if(operID == MATH_SUMBROADCAST)
+        GradSumBroadcast(node, isEfficient);
    else if(operID == REDUCE_REDUCEMEAN)
        GradReduceMean(node, isEfficient);
    else if(operID == REDUCE_REDUCESUM)
@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
 }
 /*
+gradient for multiplication by broadcasting: 
+c = a * b
+where some dimensions of b are of size 1
+dE/da = dE/dc * b
+dE/db = (dE/dc * a).reduce(0...n)
+where x.reduce(0...n) is the reduction of x along the dimensions
+whose size is 1 in b. Note that there might be several reductions.
+Only the computation for a is performed here. The dE/db formula above
+is not implemented yet: an error with the message "TODO" is raised
+when b is a variable or has incoming links.
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner (not used in this function)
+*/
+void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    DTYPE beta = income.GetParam(0);  /* read but not used below */
+    XNoder::MakeGrad(a);  /* presumably prepares a->grad for accumulation - TODO confirm in XNoder */
+    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);  /* presumably a->grad += dE/dc * b, with b broadcast - TODO confirm the meaning of the last argument */
+    if(b->isVar || b->income.tailNum > 0){
+        ShowNTErrors("TODO");  /* gradient w.r.t. b is not implemented */
+    }
+}
+/*
 gradient for negate
 for
 c = -a
@@ -1254,6 +1287,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
 }
 /* 
+gradient for sum by broadcasting: 
+c = a + b * \beta
+where some dimensions of b are of size 1
+dE/da = dE/dc
+dE/db = dE/dc.reduce(0..n) * \beta 
+where dE/dc.reduce(0..n) is the reduction of dE/dc along the dimensions
+whose size is 1 in b
+Only the computation for a is performed here. The dE/db formula above
+is not implemented yet: an error with the message "TODO" is raised
+when b is a variable or has incoming links.
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner (not used in this function)
+*/
+void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");
+    XTensor * a = income.tails[0];
+    XTensor * b = income.tails[1];
+    DTYPE beta = income.GetParam(0);  /* read but not used below; the dE/db formula would need it */
+    XNoder::MakeGrad(a);  /* presumably prepares a->grad for accumulation - TODO confirm in XNoder */
+    _Sum(a->grad, node->grad, a->grad);  /* presumably a->grad = a->grad + dE/dc - TODO confirm _Sum argument order */
+    if(b->isVar || b->income.tailNum > 0){
+        ShowNTErrors("TODO");  /* gradient w.r.t. b is not implemented */
+    }
+}
+/*
 gradient for reduceMean
 for
 c = reduceMean(a, dim)

--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -109,6 +109,11 @@ private:
    static
    void GradMultiplyDim(XTensor * node, bool isEfficient);
+    /* gradient for multiplication by broadcasting: c = a * b
+       where some dimensions of b are of size 1 */
+    static
+    void GradMultiplyBroadcast(XTensor * node, bool isEfficient);
    /* gradient for negate */
    static
    void GradNegate(XTensor * node, bool isEfficient);
@@ -143,6 +148,11 @@ private:
    static
    void GradSumDim(XTensor * node, bool isEfficient);
+    /* gradient for sum by broadcasting: c = a + b * \beta
+       where some dimensions of b are of size 1 */
+    static
+    void GradSumBroadcast(XTensor * node, bool isEfficient);
    /* gradient for reduceMean */
    static
    void GradReduceMean(XTensor * node, bool isEfficient);

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
    /* dropout */
    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, 2);
    for(int i = 0; i < nlayer; i++){
        XTensor att;
@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /* dropout */
        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, 2);
        /* residual connection */
        res = Sum(att, x);
@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /* dropout */
        if(isTraining && dropoutP > 0)
-            ende = Dropout(ende, dropoutP);
+            ende = Dropout(ende, dropoutP, 2);
        /* residual connection */
        res = Sum(ende, x);
@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /* dropout */
        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, 2);
        /* residual connection */
        res = Sum(fnn, x);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
    /* dropout */
    if(isTraining && dropoutP > 0)
-        x = Dropout(x, dropoutP);
+        x = Dropout(x, dropoutP, 2);
    for(int i = 0; i < nlayer; i++){
        XTensor att;
@@ -120,7 +120,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        /* dropout */
        if(isTraining && dropoutP > 0)
-            att = Dropout(att, dropoutP);
+            att = Dropout(att, dropoutP, 2);
        /* residual connection */
        res = Sum(att, x);
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        /* dropout */
        if(isTraining && dropoutP > 0)
-            fnn = Dropout(fnn, dropoutP);
+            fnn = Dropout(fnn, dropoutP, 2);
        /* residual connection */
        res = Sum(fnn, x);

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    _Sum(&maskEnc, padding3, &maskEnc);
    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
    decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
    outputLayer->Make(decoding, output);
    delete[] dims;

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "output", outputFN, "");
    srand((unsigned int)time(NULL));
    T2TTrainer trainer;
    trainer.Init(argc, args);

--- a/source/tensor/function/Dropout.cpp
+++ b/source/tensor/function/Dropout.cpp
@@ -39,7 +39,7 @@ for more details.
 Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
 to mark the tensor with probability p in the inference phase. Instead we perform
-the same inference procedure as that with no use of dropout on the test data.
+the same inference procedure on the test data as that with no use of dropout.
 >> x - input tensor
 >> y - output tensor
@@ -138,12 +138,21 @@ the same inference procedure as that with no use of dropout on the test data.
 >> x - input tensor
 >> dropProb - probability to set an element to zero
 >> leadingDim - the dimension which we generate the random numbers and perform broadcasting
+>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
+<< return - tensor after dropout
 */
-XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim2)
 {
    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
-    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
+    XTensor mask;
+    DTYPE * maskArray = NULL;
+    if(leadingDim < 0 && leadingDim2 < 0){
+        ShowNTErrors("TODO");
+    }
+    else if(leadingDim2 < 0){
+        int n = leadingDim;
        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
@@ -151,7 +160,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
        /* generate a mask tensor with probability p */
        int unitNum = x.dimSize[n];
-    DTYPE * maskArray = new DTYPE[unitNum];
+        maskArray = new DTYPE[unitNum];
        //srand((unsigned int)time(NULL));
        for (int i = 0; i < unitNum; i++)
@@ -163,7 +172,41 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
        delete[] maskArray;
-    return MultiplyDim(x, mask, n, 0);
+        return MultiplyDim(x, mask, n);
+    }
+    else{
+        int n = leadingDim;
+        int m = leadingDim2;
+        CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");
+        CheckNTErrors(m >= 0 && m < x.order, "Wrong leadingDim!");
+        DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
+        /* generate a mask tensor with probability p */
+        int unitNum = x.dimSize[n] * x.dimSize[m];
+        maskArray = new DTYPE[unitNum];
+        //srand((unsigned int)time(NULL));
+        for (int i = 0; i < unitNum; i++)
+            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
+        int dims[MAX_TENSOR_DIM_NUM];
+        for(int i = 0; i < x.order; i++)
+            dims[i] = 1;
+        dims[n] = x.GetDim(n);
+        dims[m] = x.GetDim(m);
+        InitTensor(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
+        mask.SetData(maskArray, unitNum);
+        delete[] maskArray;
+        return MultiplyBroadcast(x, mask);
+    }
 }
 /* 
@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
    int unitNum = x.unitNum;
    DTYPE * maskArray = new DTYPE[unitNum];
-    srand((unsigned int)time(NULL));
    for (int i = 0; i < unitNum; i++)
        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

--- a/source/tensor/function/Dropout.h
+++ b/source/tensor/function/Dropout.h
@@ -41,7 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);
 /* dropout function */
-XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1, int leadingDim2 = -1);
 /* dropout function without broadcast */
 XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb);