better code for dropout function (by broadcasting)

102db468 · xuchen · 4336f2f9 · 102db468 · 102db468 · 102db468
Commit 102db468 authored Sep 17, 2018 by xuchen
--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -125,17 +125,8 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)

    dot = Linear(dot, 1.0F/(float)sqrt((float)dk));

-    //if(llnum == 1)
-    //    dot.Dump(tf, "dot:");
-
    scalar = Softmax(dot, -1);

-    //if(llnum == 1)
-    //    scalar.Dump(tf, "scalar:");
-
-    //if(ignored > 0)
-    //    _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
-
    att = BMMul(scalar, vheads);

    /* concatenate the heads */

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -73,6 +73,9 @@ public:
       special design for the attention model. */
    int ignored;

+    /* indicates whether the model is used for training */
+    bool isTraining;
+
 public:
    /* constructor */
    T2TAttention();

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -63,6 +63,7 @@ void AttEncoder::InitModel(int argc, const char ** argv,
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);

    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
@@ -89,29 +90,34 @@ make the encoding network
 >> input - the input tensor of the encoder
 >> mask - the mask that indicate each position is valid
 >> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether the model is for training
 << return - the output tensor of the encoder
 */
-XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
 {
    XTensor x;

    x = embedder.Make(input);

+    /* dropout */
+    if(isTraining && dropoutP > 0)
+        x = Dropout(x, dropoutP);
+
    for(int i = 0; i < nlayer; i++){
        XTensor att;
        XTensor ln;
        XTensor fnn;
        XTensor res;

-        llnum = -1;
-
        /* we skip the residual connection for the first layer if
           the encoder is used in language modeling. */
        if(skipInputRes && i == 0){
            /* self attention */
            att = attentions[i].Make(x, x, x, mask);

-            /* TODO: dropout */
+            /* dropout */
+            if(isTraining && dropoutP > 0)
+                att = Dropout(att, dropoutP);

            /* layer normalization */
            x = attLayerNorms[i].Make(att); 
@@ -121,27 +127,32 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
            /* self attention */
            att = attentions[i].Make(x, x, x, mask);

+            /* dropout */
+            if(isTraining && dropoutP > 0)
+                att = Dropout(att, dropoutP);
+
            /* residual connection */
            res = Sum(att, x);
-        
-            /* TODO: dropout */

            /* layer normalization */
            x = attLayerNorms[i].Make(res);
-
-            llnum = -1;
        }

        /* fnn */
        fnn = fnns[i].Make(x);

+        /* dropout */
+        if(isTraining && dropoutP > 0)
+            fnn = Dropout(fnn, dropoutP);
+
        /* residual connection */
        res = Sum(fnn, x);

-        /* TODO: dropout */
-
        /* layer normalization */
        x = fnnLayerNorms[i].Make(res);
+
+        if(isTraining && dropoutP > 0)
+            x = Dropout(x, dropoutP);
    }

    return x;

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
@@ -40,7 +40,7 @@ class T2TEncoder
 {
 public:
    virtual
-    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes) = 0;
+    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining) = 0;
 };

 /* 
@@ -49,7 +49,7 @@ the encoder based on RNN
 class RNNEncoder : T2TEncoder
 {
 public:
-    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
 };


@@ -77,6 +77,9 @@ public:
    /* vocabulary size */
    int vSize;

+    /* dropout probability */
+    DTYPE dropoutP;
+
    /* some positions can be ignored in attention. this is useful in lm where the first position needs
       special design for the attention model. */
    int ignored;
@@ -115,7 +118,7 @@ public:
                   int myDevID = -1, XMem * myMem = NULL);

    /* make the encoding network */
-    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor Make(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);
 };



--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -58,7 +58,7 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)

    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "fnnh", &hSize, DEFAULT_EMBEDDING_SIZE * 4);
    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);

    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -77,11 +77,12 @@ make the encoding network
 >> input - input tensor
 >> mask - the mask for positions that are/not involved in computation
 >> skipInputRes - indicates whether we skip the residual connection of the first layer
+>> isTraining - indicates whether we are training the model
 << return - encoding result
 */
-XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
+XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining)
 {
-    return encoder.Make(input, mask, skipInputRes);
+    return encoder.Make(input, mask, skipInputRes, isTraining);
 }

 /* 
@@ -89,8 +90,9 @@ make the entire network (with the output softmax layer)
 >> input - input tensor
 >> output - output tensor (distribution)
 >> padding - padding of the sequences
+>> isTraining - indicates whether the model is for training
 */
-void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
+void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
 {
    XTensor encoding;
    
@@ -134,7 +136,7 @@ void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
        
        //_Sum(&mask, padding3, &mask);

-        encoding = MakeEncoding(input, mask, true);
+        encoding = MakeEncoding(input, mask, true, isTraining);
        outputLayer.Make(encoding, output);

        delete[] dims;

--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -69,10 +69,10 @@ public:
    void InitModel(int argc, const char ** argv);

    /* make the encoding network */
-    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
+    XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes, bool isTraining);

    /* make the entire network (with the output softmax layer) */
-    void Make(XTensor &input, XTensor &output, XTensor &padding);
+    void Make(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);

    /* get parameter matrics */
    void GetParams(XList &list);

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -149,7 +149,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
            XTensor output;
            
            /* make the network */
-            model->Make(batch, output, padding);
+            model->Make(batch, output, padding, true);

            /* make paddings for the output */
            if(output.GetDim(0) > 1)
@@ -166,16 +166,6 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
            
            /* get probabilities */
            float prob = GetProb(&output, &gold, NULL);
-
-            MTYPE totalUsed = 0;
-            MTYPE totalSize = 0;
-            
-            for (int i = 0; i <= mem->curBlockID; i++) {
-                totalSize += mem->blocks[i].size;
-                totalUsed += mem->blocks[i].used;
-            }
-
-            //fprintf(stderr, "%d(%ld,%ld,%f)\n", mem->curBlockID, totalUsed, totalSize, (float)totalUsed/totalSize);
            
            loss += -prob;
            wordCount += wc;
@@ -209,6 +199,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)

    fclose(tf);
    
+    epoch = MIN(epoch, nepoch);
+    
    XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
            lr, elapsed, step, epoch, wordCountTotal, exp(loss / wordCount));
    XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
@@ -271,7 +263,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
        XTensor output;
            
        /* make the network */
-        model->Make(batch, output, padding);
+        model->Make(batch, output, padding, false);

        int bSize = batch.GetDim(0);
        int length = batch.GetDim(1);
@@ -333,11 +325,19 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)

 char line[MAX_SEQUENCE_LENGTH];

+struct SampleNode /* one entry per sampled sequence; filled by LoadBuf when isSorted is set */
+{
+    int id;   /* running index of the sample */
+    int size; /* length of the sampled sequence (taken from seqLen) */
+};
+
 /* 
 load data to buffer 
 >> file - where to load data
+>> isSorted - indicates whether the samples are sorted by length
+>> step - the number of sequences we skip over when moving to the next sample
 */
-int T2TTrainer::LoadBuf(FILE * file)
+int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
 {
    int lineCount = 0;
    int seqCount = 0;
@@ -403,6 +403,17 @@ int T2TTrainer::LoadBuf(FILE * file)
    nseqBuf = seqCount;
    nextSeq = 0;

+    if (isSorted) {
+        SampleNode * nodes = new SampleNode[seqCount];
+        int count = 0;
+        for (int i = 0; i < seqCount; i += step) {
+            nodes[count].id = count;
+            nodes[count].size = seqLen[i];
+            count++;
+        }
+        delete[] nodes;
+    }
+
    return lineCount;
 }

@@ -430,7 +441,7 @@ load a batch of sequences
 >> devID - device id
 >> mem - memory pool
 */
-int T2TTrainer::LoadBatch(FILE * file, bool isLM,
+int T2TTrainer::LoadBatch(FILE * file, bool isLM, 
                          XTensor * batch, XTensor * padding, XTensor * output, 
                          int * seqs,
                          int step, int vs, int sBatch, int wBatch, 
@@ -438,7 +449,7 @@ int T2TTrainer::LoadBatch(FILE * file, bool isLM,
                          int devID, XMem * mem)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
-        LoadBuf(file);
+        LoadBuf(file, isSorted, step); /* buffer exhausted: reload it; LoadBuf takes (file, isSorted, step) */

    int seq = MAX(nextSeq, 0);
    int wc = 0;

--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -118,7 +118,7 @@ public:
    void Test(const char * fn, const char * ofn, T2TModel * model);

    /* load data to buffer */
-    int LoadBuf(FILE * file);
+    int LoadBuf(FILE * file, bool isSorted, int step);

    /* clear data buffer */
    void ClearBuf();

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -747,6 +747,64 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
    	CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
    }

+    /*if(testxmemid == 30){
+        recordp = result;
+    }
+
+    if(curBlockID >= 25){
+        MHeader * head = blocks[25].head;
+        while(head != NULL){
+            fprintf(stderr, "head: %ld %ld\n", head->indexNode->pReal, head->indexNode->size);
+            head = head->next;
+        }
+    }
+
+    if(testxmemid == 32){
+        int nnn = 0;
+    }
+
+    if(recordp != NULL){
+        MTYPE size = mySize;
+        if(size <= minSizeIndex[0])
+            size = minSizeIndex[0];
+    
+        MPieceNode * entry = NULL;
+        MPieceNode * node = NULL;
+        MPieceNode * hit = NULL;
+        MPieceNode * last = NULL;
+    
+        entry = memIndex + indexEntryNum + FindIndexEntry(size);
+    
+        last = entry;
+        node = entry->next;
+    
+        while(node != NULL){
+            CheckNTErrors(node->pre == last, "Something is wrong!");
+            CheckNTErrors(last->next == node, "Something is wrong!");
+            CheckNTErrors(node->head.state == 2, "Something is wrong!");
+            last = node;
+        
+            if(node->size == 0){
+                MPieceNode * next = node->next;
+                RemoveFreeIndexNode(node, entry);
+                node = next;
+                ShowNTErrors("Something is wrong!");
+            }
+            else{
+                CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
+                if(node->pReal == recordp){
+                    hit = node;
+                    break;
+                }
+                node = node->next;
+            }
+        }
+
+        if(hit == NULL){
+            int nnn = 0;
+        }
+    }*/
+
    return result;
 }

@@ -918,6 +976,8 @@ void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
    hit->head.state = 1;
    
    RemoveAllocIndexNode(hit);
+
+    hit->size = (char*)hit->p + hit->head.size - (char*)GetPitchedAddress((char*)hit->p, MY_PITCH);
    AddFreeIndexNode(hit);

    blocks[hit->head.blockID].used -= hit->head.size;
@@ -981,8 +1041,9 @@ void XMem::RebuildIndex()
                /* make a new index node */
                MPieceNode * newNode = memIndex2 + nodeNumUsed2++;
                newNode->p = p;
-                newNode->size = (char*)p + head->size - 
-                                ( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
+                newNode->size = node->size;
+                //newNode->size = (char*)p + head->size - 
+                //                ( head->state == 1 ? (char*)GetPitchedAddress((char*)p, MY_PITCH) : (char*)head->indexNode->pReal);
                newNode->pre = NULL;
                newNode->next = NULL;
                

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -552,11 +552,17 @@ void XTensor::SetZeroAll(XStream * stream)
        if(devID >= 0){
 #ifdef USE_CUDA
            int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;
+            
+            int devIDBackup = 0;
+            cudaGetDevice(&devIDBackup);
+            cudaSetDevice(devID);

            if(stream == NULL)
                cudaMemset(data, 0, size);
            else
                cudaMemsetAsync(data, 0, size, stream->stream);
+            
+            cudaSetDevice(devIDBackup);
 #endif
        }
        else
@@ -567,10 +573,16 @@ void XTensor::SetZeroAll(XStream * stream)
    else{
        if(devID >= 0){
 #ifdef USE_CUDA
+            int devIDBackup = 0;
+            cudaGetDevice(&devIDBackup);
+            cudaSetDevice(devID);
+            
            if(stream == NULL)
                cudaMemset(data, 0, unitNum * unitSize);
            else
                cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
+            
+            cudaSetDevice(devIDBackup);
 #endif
        }
        else

--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -66,8 +66,8 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
    for (int i = 0; i < a->order; i++) {
        if (i != leadingDimRDI) {
            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                a->dimSizeRDI[i] == c->dimSizeRDI[i]),
-                "Unmatched tensors!");
+                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+                          "Unmatched tensors!");
        }
        if (i < leadingDimRDI)
            stride *= a->dimSizeRDI[i];

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
@@ -77,7 +77,7 @@ where |a_lead| means the size of the leading dimension of a
 */
 template<int nonZeroAlpha> __global__
 void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha,
-    int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
+                                       int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum)
 {
    __shared__ DTYPE* ap[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE* bp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
@@ -171,14 +171,12 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
                if (alpha == 0) {
                    KernelMulElementWiseTensorDynamic<0> << <blocks, threads >> >
                        ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, 0,
-                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
-                            blockNum);
+                          stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
                }
                else {
                    KernelMulElementWiseTensorDynamic<1> << <blocks, threads >> >
                        ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, alpha,
-                            stride, dimensionSizeA, dimensionSizeB, dimensionSizeC,
-                            blockNum);
+                          stride, dimensionSizeA, dimensionSizeB, dimensionSizeC, blockNum);
                }
            }
        }

--- a/source/tensor/function/Dropout.cpp
+++ b/source/tensor/function/Dropout.cpp
@@ -25,120 +25,59 @@
 #include "Dropout.h"
 #include "Dropout.cuh"
 #include "../core/arithmetic/Multiply.h"
+#include "../core/arithmetic/MultiplyDim.h"
 #include "../core/math/ScaleAndShift.h"

 namespace nts{ // namespace nts(NiuTrans.Tensor

 /*
-generate a random bernoulli number
-*/
-DTYPE RandomBernoulli(DTYPE prob)
-{
-    return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
-}
-
-/*
 dropout function
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.

-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.

-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
+Here, the output is scaled by a factor of \frac{1}{1-p}, so the tensor does not
+need to be rescaled by (1-p) in the inference phase. Inference on the test data
+is therefore performed exactly as if dropout had not been used.
+ 
 >> x - input tensor
 >> y - output tensor
->> prob - probability to set an element zero
+>> seed - random seed
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension along which the random mask is generated; the mask is then broadcast over the other dimensions (a negative value selects the last dimension)
 */
-void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
+void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb, int leadingDim)
 {
-    CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
-    
-   DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
-    
-    /* generate a mask tensor again with special probability */
-    srand(seed);
-    int unitNum = x->unitNum;
-    DTYPE * maskArray = new DTYPE[unitNum];
-    for (int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

-    XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
-    maskTensor->SetData(maskArray, unitNum);
+    int n = leadingDim < 0 ? x->order - 1 : leadingDim;

-#ifdef USE_CUDA
-    if(x->devID >=0 || y->devID >= 0){
-        _CudaDropout(x, y, maskTensor, scaleFactor);
-        
-        DelTensorBuf(maskTensor);
-        delete[] maskArray;
-        return;
-    }
-#endif
+    CheckNTErrors(n >= 0 && n < x->order, "Wrong leadingDim!");

-    XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
-    _Multiply(x, maskTensor, inter);
-    _ScaleAndShift(inter, y, scaleFactor, 0);
-    
-    DelTensorBuf(inter);
-    DelTensorBuf(maskTensor);
-    delete[] maskArray;
-}
-
-/*
-dropout function (return a XTensor structure)
-make a new tensor to keep the result and return it
-
-During training, randomly zeroes some of the elements of the input tensor
-with probability p using samples from a Bernoulli distribution.
-The elements to zero are randomized on every forward call.
-
-This has proven to be an effective technique for regularization and
-preventing the co-adaptation of neurons as described in the paper
-"Improving neural networks by preventing co-adaptation of feature detectors".
-
-Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
-This means that during evaluation the module simply computes an identity function.
->> x - input tensor
->> y - output tensor
->> prob - probability to set an element zero
-*/
-XTensor Dropout(const XTensor &x, DTYPE prob)
-{
-    XTensor y(&x);
-    y.SetTMP();
-
-   DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
+    DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
    
    /* generate a mask tensor again with special probability */
-    srand((unsigned int)time(NULL));
-    int unitNum = x.unitNum;
+    int unitNum = x->dimSize[n];
    DTYPE * maskArray = new DTYPE[unitNum];
-    for (int i = 0; i < unitNum; i++)
-        maskArray[i] = RandomBernoulli(prob);
-
-    XTensor maskTensor(&x);
-    maskTensor.SetData(maskArray, unitNum);

-    XTensor inter;
-    inter = Multiply(x, maskTensor);
-    y = ScaleAndShift(inter, scaleFactor, 0);
+    srand(seed);
+    for (int i = 0; i < unitNum; i++)
+        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

-    delete[] maskArray;
+    XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
+    mask->SetData(maskArray, unitNum);

-    ///* tensor connection */
-    //XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
-    //XLink::AddParamToHead(&y, prob);
+    /* call Multiply function for mask */
+    _MultiplyDim(x, mask, y, n, 0);
    
-    return y;
+    delete mask;
+    delete[] maskArray;
 }

 /* 
-backward computation of dropout function
+backward computation of the dropout function

 dE/dx = dE/dy * dy/dx

@@ -146,48 +85,86 @@ dE/dx = dE/dy * dy/dx
 >> x - input of the dropout function
 >> dedy - dE/dy
 >> dedx - dE/dx
->> prob - probability to set an element zero
+>> seed - random seed
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension along which the random mask is generated; the mask is then broadcast over the other dimensions (a negative value selects the last dimension)
 */
 void _DropoutBackward(const XTensor * y, const XTensor * x, 
                      const XTensor * dedy, XTensor * dedx, 
-                      unsigned int seed, DTYPE prob)
+                      unsigned int seed, DTYPE dropProb, int leadingDim)
 {
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
+
+    int n = leadingDim < 0 ? x->order - 1 : leadingDim;
+
+    CheckNTErrors(n >= 0 && n < x->order, "Wrong leadingDim!");
+
    if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
    {
-        int unitNum = y->unitNum;
-        DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
+        DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - dropProb);

        /* generate a mask tensor again with special probability */
-        srand(seed);
+        int unitNum = x->dimSize[n];
        DTYPE * maskArray = new DTYPE[unitNum];
+        
+        srand(seed);
        for (int i = 0; i < unitNum; i++)
-            maskArray[i] = RandomBernoulli(prob);
-
-        XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
-        maskTensor->SetData(maskArray, unitNum);
-
-        #ifdef USE_CUDA
-            if(x->devID >= 0 || y->devID >= 0){
-                _CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
-                
-                DelTensorBuf(maskTensor);
-                delete[] maskArray;
-                return;
-            }
-        #endif
+            maskArray[i] = RandomBernoulli(dropProb, scaleFactor);

-        DTYPE * dedyp = (DTYPE*)dedy->data;
-        DTYPE * dedxp = (DTYPE*)dedx->data;
+        XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
+        mask->SetData(maskArray, unitNum);

-        /* dE/dx = dE/dy * dy/dx */
-        for(int i = 0; i < unitNum; i++)
-            dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
+        /* call MultiplyDim function for mask */
+        _MultiplyDim(dedy, mask, dedx, n, 0);

-        DelTensorBuf(maskTensor);
+        delete mask;
        delete[] maskArray;
    }
    else
        ShowNTErrors("TODO!");
 }
+    
+/*
+dropout function (we make tensor connections here)
+It randomly zeroes some of the elements of the input tensor
+with probability p via a Bernoulli distribution.
+ 
+See "Improving neural networks by preventing co-adaptation of feature detectors"
+for more details.
+ 
+Here, the output is scaled by a factor of \frac{1}{1-p}, so the tensor does not
+need to be rescaled by (1-p) in the inference phase. Inference on the test data
+is therefore performed exactly as if dropout had not been used.
+ 
+>> x - input tensor
+>> dropProb - probability to set an element to zero
+>> leadingDim - the dimension along which the random mask is generated; the mask is then broadcast over the other dimensions (a negative value selects the last dimension)
+*/
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
+{
+    CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");

+    int n = leadingDim < 0 ? x.order - 1 : leadingDim;
+    CheckNTErrors(n >= 0 && n < x.order, "Wrong leadingDim!");

+    /* 1/(1-p) is undefined for p = 1; every unit is dropped in that case, so 0 is used as the scale */
+    DTYPE scaleFactor = dropProb < (DTYPE)1.0 ? (DTYPE)1.0 / ((DTYPE)1.0 - dropProb) : (DTYPE)0.0;
+    
+    /* generate a 1D mask over dimension n; each kept unit holds the scale factor */
+    int unitNum = x.dimSize[n];
+    DTYPE * maskArray = new DTYPE[unitNum];
+    /* rand() is deliberately not reseeded here: srand(time(NULL)) restarts the same
+       sequence for every call made within one second, so those calls drew identical
+       masks. Seed the generator once at program start instead. */
+    for (int i = 0; i < unitNum; i++)
+        maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
+    
+    XTensor mask;
+    InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
+    mask.SetData(maskArray, unitNum);
+    delete[] maskArray;
+    
+    return MultiplyDim(x, mask, n, 0);
+}

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/function/Dropout.h
+++ b/source/tensor/function/Dropout.h
@@ -27,16 +27,22 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

-/* dropout function */
-void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
+/* draw one Bernoulli sample from rand(): returns `value` when rand()/RAND_MAX >= dropProb and 0 otherwise */
+inline DTYPE RandomBernoulli(DTYPE dropProb, DTYPE value)
+{
+    return (DTYPE)rand()/(DTYPE)RAND_MAX >= dropProb ? (DTYPE)value : 0;
+}

 /* dropout function */
-XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
+void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb, int leadingDim = -1);

 /* de/dx */
 void _DropoutBackward(const XTensor * y, const XTensor * x, 
                      const XTensor * dedy, XTensor * dedx, 
-                      unsigned int seed, DTYPE prob = 0.5);
+                      unsigned int seed, DTYPE dropProb, int leadingDim = -1);
+    
+/* dropout function */
+XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim = -1);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/function/FHeader.h
+++ b/source/tensor/function/FHeader.h
@@ -26,6 +26,7 @@

 #include "../XTensor.h"

+#include "Dropout.h"
 #include "HardTanH.h"
 #include "Identity.h"
 #include "LogSoftmax.h"

--- a/source/tensor/test/TDropout.cpp
+++ b/source/tensor/test/TDropout.cpp
@@ -31,10 +31,11 @@ case 1: test Dropout function.
 bool TestDropout1()
 {
    /* a input tensor of size (4, 5) */
-    int order = 2;
+    int order = 3;
    int * dimSize = new int[order];
    dimSize[0] = 40;
    dimSize[1] = 50;
+    dimSize[2] = 60;

    int unitNum = 1;
    for (int i = 0; i < order; i++)
@@ -49,14 +50,14 @@ bool TestDropout1()
    XTensor yUser;

    /* initialize variables */
-    x->SetDataRand(0, 1);
+    _SetDataFixedFloat(x, 1.0F);
    y->SetZeroAll();

    /* call Dropout function */
-    float prob = 0.2F;
+    float drop_prob = 0.2F;
    int seed = 20;
-    _Dropout(x, y, seed, prob);
-    yUser = Dropout(*x);
+    _Dropout(x, y, seed, drop_prob);
+    yUser = Dropout(*x, drop_prob);

    /* check result */
    int zeroNum1 = 0;
@@ -73,9 +74,9 @@ bool TestDropout1()
    }
    printf("CPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
-    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
+    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
-    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
+    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);

 #ifdef USE_CUDA
    /* GPU test */
@@ -87,12 +88,12 @@ bool TestDropout1()
    XTensor yUserGPU;

    /* initialize variables */
-    xGPU->SetDataRand(0, 1);
+    _SetDataFixedFloat(xGPU, 1.0F);
    yGPU->SetZeroAll();

    /* call Dropout function */
-    _Dropout(xGPU, yGPU, seed, prob);
-    yUserGPU = Dropout(*xGPU);
+    _Dropout(xGPU, yGPU, seed, drop_prob);
+    yUserGPU = Dropout(*xGPU, drop_prob);

    /* check result */
    zeroNum1 = 0;
@@ -109,9 +110,9 @@ bool TestDropout1()
    }
    printf("CPU Test:\n");
    printf("In tensor y, there are %d units.\n", unitNum);
-    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
+    printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, drop_prob);
    printf("In tensor yUser, there are %d units.\n", unitNum);
-    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
+    printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, drop_prob);

    /* destroy variables */
    delete x;
@@ -159,13 +160,13 @@ bool TestDropout2()
    _SetDataFixedFloat(x, 1.0F);
    y->SetZeroAll();
    dedx->SetZeroAll();
-    _SetDataFixedFloat(dedy, 1.0F);
+    _SetDataFixedFloat(dedy, 1.5F);

    /* call Dropout function */
-    float prob = 0.5F;
+    float drop_prob = 0.5F;
    int seed = 1;
-    _Dropout(x, y, seed, prob);
-    _DropoutBackward(y, x, dedy, dedx, 1, prob);
+    _Dropout(x, y, seed, drop_prob);
+    _DropoutBackward(y, x, dedy, dedx, 1, drop_prob);

    /* check result */
    y->Dump(stderr, "y");
@@ -185,11 +186,11 @@ bool TestDropout2()
    _SetDataFixedFloat(xGPU, 1.0F);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
-    _SetDataFixedFloat(dedyGPU, 1.0F);
+    _SetDataFixedFloat(dedyGPU, 1.5F);

    /* call Dropout function */
-    _Dropout(xGPU, yGPU, seed, prob);
-    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, prob);
+    _Dropout(xGPU, yGPU, seed, drop_prob);
+    _DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, 1, drop_prob);

    /* check result */
    yGPU->Dump(stderr, "yGPU");

--- a/source/tensor/test/TXMem.cpp
+++ b/source/tensor/test/TXMem.cpp
@@ -65,9 +65,10 @@ bool TestXMemCase1()

            for (int i = 0; i < testNum * scalar; i++) {
                testxmemid++;
-                //fprintf(stderr, "%d %d\n", testxmemid, ok);
                int j = rand() % caseNum;

+                //fprintf(stderr, "%d %d %d\n", testxmemid, j, ok);
+
                if (p[j] == NULL) {
                    p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
                    for (int k = 0; k < size[j]; k++)