Commit 2e20824a by xiaotong

better code of dropout

parent df76b612
@@ -39,20 +39,19 @@ DTYPE RandomBernoulli(DTYPE prob)
 /*
 dropout function
-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.
+Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
+to rescale the tensor by 1-p in the inference phase. Instead we perform
+the same inference procedure as if dropout were not used on the test data.
 >> x - input tensor
 >> y - output tensor
->> prob - probability to set an element zero
+>> prob - probability to set an element to zero
 */
 void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
 {
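The scaling by \frac{1}{1-p} above is "inverted dropout": each element survives with probability 1-p and the survivors are multiplied by 1/(1-p), so the expected value of every element matches the input and inference can simply skip the operation. A minimal standalone sketch of the idea in plain C++ (not the NiuTrans.Tensor API; the function name and the use of rand() are illustrative assumptions):

    #include <cstdlib>
    #include <vector>

    /* Illustrative inverted dropout on a plain float buffer:
       zero an element with probability p, scale survivors by 1/(1-p),
       so E[output[i]] == input[i] and no rescaling is needed at inference. */
    void DropoutSketch(std::vector<float> & v, float p, unsigned int seed)
    {
        srand(seed);
        const float scale = 1.0f / (1.0f - p);
        for (size_t i = 0; i < v.size(); ++i) {
            float u = (float)rand() / (float)RAND_MAX;   /* uniform draw in [0, 1] */
            v[i] = (u < p) ? 0.0f : v[i] * scale;        /* drop or rescale */
        }
    }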
@@ -90,55 +89,7 @@ void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
 }
 
 /*
-dropout function (return a XTensor structure)
-make a new tensor to keep the result and return it
-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
->> x - input tensor
->> y - output tensor
->> prob - probability to set an element zero
-*/
-XTensor Dropout(const XTensor &x, DTYPE prob)
-{
-    XTensor y(&x);
-    y.SetTMP();
-
-    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
-
-    /* generate a mask tensor again with special probability */
-    srand((unsigned int)time(NULL));
-    int unitNum = x.unitNum;
-    DTYPE * maskArray = new DTYPE[unitNum];
-    for (int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
-
-    XTensor maskTensor(&x);
-    maskTensor.SetData(maskArray, unitNum);
-
-    XTensor inter;
-    inter = Multiply(x, maskTensor);
-    y = ScaleAndShift(inter, scaleFactor, 0);
-
-    delete[] maskArray;
-
-    ///* tensor connection */
-    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
-    //XLink::AddParamToHead(&y, prob);
-
-    return y;
-}
-
-/*
-backward computation of dropout function
+backward computation of the dropout function
 dE/dx = dE/dy * dy/dx
@@ -166,7 +117,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
     XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
     maskTensor->SetData(maskArray, unitNum);
 
 #ifdef USE_CUDA
     if(x->devID >= 0 || y->devID >= 0){
         _CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
@@ -174,7 +125,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
         delete[] maskArray;
         return;
     }
 #endif
 
     DTYPE * dedyp = (DTYPE*)dedy->data;
     DTYPE * dedxp = (DTYPE*)dedx->data;
@@ -190,4 +141,49 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
         ShowNTErrors("TODO!");
 }
 
+/*
+dropout function (we make tensor connections here)
+make a new tensor to keep the result and return it
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.
+Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
+to rescale the tensor by 1-p in the inference phase. Instead we perform
+the same inference procedure as if dropout were not used on the test data.
+>> x - input tensor
+>> prob - probability to set an element to zero
+*/
+XTensor Dropout(const XTensor &x, DTYPE prob)
+{
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
+
+    /* generate a mask tensor with the given drop probability */
+    srand((unsigned int)time(NULL));
+    int unitNum = x.unitNum;
+    DTYPE * maskArray = new DTYPE[unitNum];
+    for (int i = 0; i < unitNum; i++)
+        maskArray[i] = RandomBernoulli(prob);
+
+    XTensor maskTensor(&x);
+    maskTensor.SetData(maskArray, unitNum);
+
+    XTensor y;
+    XTensor inter;
+    inter = Multiply(x, maskTensor);
+    y = ScaleAndShift(inter, scaleFactor, 0);
+
+    delete[] maskArray;
+
+    ///* tensor connection */
+    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
+    //XLink::AddParamToHead(&y, prob);
+
+    return y;
+}
 
 } // namespace nts(NiuTrans.Tensor)
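For completeness, a hypothetical call site for the returned-value form added above (illustrative only; the surrounding setup of x is assumed, not taken from this commit):

    /* Hypothetical training-time call site: x is an activation tensor
       that the surrounding code has already shaped and filled. */
    XTensor ApplyDropout(const XTensor &x)
    {
        /* zero each element with probability 0.3; survivors are scaled by 1/0.7 */
        return Dropout(x, (DTYPE)0.3);
    }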
@@ -30,14 +30,14 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* dropout function */
 void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
 
-/* dropout function */
-XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
-
 /* de/dx */
 void _DropoutBackward(const XTensor * y, const XTensor * x,
                       const XTensor * dedy, XTensor * dedx,
                       unsigned int seed, DTYPE prob = 0.5);
 
+/* dropout function */
+XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
+
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __DROPOUT_H__