Commit 4336f2f9 by xuchen

1. add dropout function 2. add some functions in the tensor/core/math/unary files 3. merge with xiaotong-working branch
parent 98db6f24
......@@ -39,7 +39,6 @@ void SumDimTest();
using namespace nts;
using namespace fnnlm;
using namespace transformer;
using namespace GAN;
int main( int argc, const char ** argv )
{
......@@ -47,9 +46,7 @@ int main( int argc, const char ** argv )
//BackwardTest();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
TransformerMain(argc - 1, argv + 1);
......
......@@ -451,10 +451,10 @@ void XMathGrad::GradDivDim(XTensor * node)
DelTensorBuf(interGrad);
}
DelTensorBuf(aTMP1);
DelTensorBuf(aTMP2);
DelTensorBuf(bTMP);
DelTensorBuf(interGradTMP);
DelTensorBuf(bTMP);
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
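/* hedged note on the reordering above: tensors obtained from NewTensorBuf live
   in a stack-like buffer, so they are released in the reverse order of their
   allocation */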
node->visitMark = NODE_FINISHED;
}
......@@ -499,8 +499,8 @@ void XMathGrad::GradMatrixMul(XTensor * node)
a->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
c->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(a->unitNum/a->GetDim(-1), a->GetDim(-1));
dedc->Reshape(c->unitNum/c->GetDim(-1), c->GetDim(-1));
deda->Reshape(deda->unitNum/deda->GetDim(-1), deda->GetDim(-1));
dedc->Reshape(dedc->unitNum/dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha);
......@@ -760,7 +760,7 @@ void XMathGrad::GradMultiplyDim(XTensor * node)
DelTensorBuf(interGrad);
}
DelTensor(bGradTMP);
DelTensorBuf(bGradTMP);
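/* hedged note: bGradTMP is presumably allocated with NewTensorBuf, so it must
   be released with DelTensorBuf rather than DelTensor */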
node->visitMark = NODE_FINISHED;
}
......@@ -796,6 +796,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node)
{
ShowNTErrors("This is really a bad piece of code!!!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
......@@ -902,7 +904,7 @@ void XMathGrad::GradPower(XTensor * node)
_ScaleAndShiftMe(b, p);
_Multiply(node->grad, b, a->grad, 1.0F);
DelTensor(b);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
......@@ -1229,7 +1231,7 @@ void XMathGrad::GradReduceSum(XTensor * node)
_Unsqueeze(node->grad, b, dim, n);
_Sum(a->grad, b, a->grad);
DelTensor(b);
DelTensorBuf(b);
node->visitMark = NODE_FINISHED;
}
......@@ -1274,10 +1276,10 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
_ScaleAndShiftMe(f, -2.0F);
_Multiply(node->grad, f, b->grad, 1.0F);
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
}
......@@ -1323,10 +1325,10 @@ void XMathGrad::GradReduceVariance(XTensor * node)
_ScaleAndShiftMe(f, -2.0F /n);
_Multiply(node->grad, f, b->grad, 1.0F);
DelTensorBuf(c);
DelTensorBuf(d);
DelTensorBuf(e);
DelTensorBuf(f);
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
node->visitMark = NODE_FINISHED;
}
......
......@@ -145,14 +145,19 @@ void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
lossGrad.Compute(gold, root, root->grad, loss);
}
}
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);;
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark == NODE_FINISHED)
continue;
BackwardNode(node);
BackwardNode(node);
}
}
......
......@@ -116,15 +116,25 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask)
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
if(ignored > 0)
_SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
dot = Linear(dot, 1.0F/(float)sqrt((float)dk));
//if(llnum == 1)
// dot.Dump(tf, "dot:");
scalar = Softmax(dot, -1);
//if(llnum == 1)
// scalar.Dump(tf, "scalar:");
//if(ignored > 0)
// _SetDataDim(&scalar, 0, ignored, scalar.order - 2, 1e-9F);
att = BMMul(scalar, vheads);
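/* hedged reading of the change above: the fused
       scalar = Softmax(Linear(dot, 1/(float)sqrt((float)dk)), -1);
   is split into separate Linear and Softmax steps so that the intermediate
   logits can be dumped for debugging (see the commented-out Dump calls) */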
......
......@@ -111,6 +111,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
memcpy(dims, input.dimSize, input.order * sizeof(int));
dims[input.order - 1] = eSize;
XTensor wordEmbedding;
XTensor posEmbedding;
bool match = (posEmbedding.order == input.order);
if(match){
for(int i = 0; i < input.order; i++){
......@@ -120,7 +123,8 @@ XTensor T2TEmbedder::Make(XTensor &input)
}
/* we make positional embeddings first */
if(!match){
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order, dims, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
......@@ -130,8 +134,6 @@ XTensor T2TEmbedder::Make(XTensor &input)
DelTensorBuf(posTMP);
}
XTensor wordEmbedding;
/* then we make word embeddings */
wordEmbedding = Linear(MMul(input, w), (float)sqrt((float)d));
......
......@@ -63,9 +63,6 @@ public:
the embedding processing by re-loading. */
XTensor posEmbeddingBase;
/* positional embeddings */
XTensor posEmbedding;
public:
/* constructor */
T2TEmbedder();
......
......@@ -103,6 +103,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
XTensor fnn;
XTensor res;
llnum = -1;
/* we skip the residual connection for the first layer if
the encoder is used in language modeling. */
if(skipInputRes && i == 0){
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -113,6 +117,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
x = attLayerNorms[i].Make(att);
}
else{
/* self attention */
att = attentions[i].Make(x, x, x, mask);
......@@ -123,6 +128,8 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool skipInputRes)
/* layer normalization */
x = attLayerNorms[i].Make(res);
llnum = -1;
}
/* fnn */
......
......@@ -27,7 +27,7 @@
namespace transformer
{
/* constructor */
T2TLN::T2TLN()
{
......
......@@ -51,19 +51,24 @@ initialize the model
void T2TModel::InitModel(int argc, const char ** argv)
{
bool useMem = false;
int memSize = 0;
bool isMemFreeOTF = false;
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mem", &useMem, useMem);
LoadParamInt(argc, argv, "memsize", &memSize, 1024);
LoadParamBool(argc, argv, "lm", &isLM, true);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
if(useMem){
delete mem;
mem = new XMem(devID);
mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
}
encoder.InitModel(argc, argv, isLM, isLM ? 1 : 0, devID, mem);
encoder.InitModel(argc, argv, isLM, 0, devID, mem);
outputLayer.InitModel(argc, argv, devID, mem);
}
......@@ -83,8 +88,9 @@ XTensor T2TModel::MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes)
make the entire network (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
*/
void T2TModel::Make(XTensor &input, XTensor &output)
void T2TModel::Make(XTensor &input, XTensor &output, XTensor &padding)
{
XTensor encoding;
......@@ -98,18 +104,118 @@ void T2TModel::Make(XTensor &input, XTensor &output)
dims[input.order] = len;
XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e-9 */
_SetDataLowTri(&mask, 1e9F, -1);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent attention to the current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
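/* worked 3x3 example (hedged, assuming _SetDataLowTri(&mask, 1e9F, 0) fills the
   lower triangle including the diagonal with 1e9 and everything above with 0):
       after _SetDataLowTri:   1e9    0     0      after _ScaleAndShiftMe:   0  -1e9  -1e9
                               1e9   1e9    0                                0    0   -1e9
                               1e9   1e9   1e9                               0    0     0
   adding this mask to the attention logits leaves allowed positions unchanged
   and drives masked positions to -1e9 before the softmax */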
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
padding.denseRatio, padding.devID, padding.mem);
/* mask of the padding */
_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
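/* hedged shape sketch: for a padding tensor of shape (batch, len), padding2 has
   shape (batch, len, len) after the first _Unsqueeze, padding3 has shape
   (nhead, batch, len, len) after the second, and the scale-and-shift maps the
   1/0 padding indicators to 0/-1e9 additive mask values */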
//_Sum(&mask, padding3, &mask);
encoding = MakeEncoding(input, mask, true);
outputLayer.Make(encoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding2);
DelTensorBuf(padding3);
}
else{
ShowNTErrors("TODO!");
}
}
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void T2TModel::GetParams(XList &list)
{
list.Clear();
list.Add(&outputLayer.w);
for(int i = 0; i < encoder.nlayer; i++){
list.Add(&encoder.fnns[i].w1);
list.Add(&encoder.fnns[i].b1);
list.Add(&encoder.fnns[i].w2);
list.Add(&encoder.fnns[i].b2);
list.Add(&encoder.attentions[i].wk);
list.Add(&encoder.attentions[i].wq);
list.Add(&encoder.attentions[i].wv);
list.Add(&encoder.fnnLayerNorms[i].w);
list.Add(&encoder.fnnLayerNorms[i].b);
list.Add(&encoder.attLayerNorms[i].w);
list.Add(&encoder.attLayerNorms[i].b);
}
list.Add(&encoder.embedder.w);
}
/*
dump the parameters
>> fn - where to keep the model
*/
void T2TModel::Dump(const char * fn)
{
FILE * file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Dump(file, "param:");
}
fclose(file);
XPRINT(0, stderr, "[INFO] model saved\n");
}
/* read the parameters */
void T2TModel::Read(const char * fn)
{
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
p->Read(file, "param:");
}
fclose(file);
XPRINT(0, stderr, "[INFO] model loaded\n");
}
}
......@@ -72,9 +72,18 @@ public:
XTensor MakeEncoding(XTensor &input, XTensor &mask, bool skipInputRes);
/* make the entire network (with the output softmax layer) */
void Make(XTensor &input, XTensor &output);
void Make(XTensor &input, XTensor &output, XTensor &padding);
/* get parameter matrices */
void GetParams(XList &list);
/* dump the parameters */
void Dump(const char * fn);
/* read the parameters */
void Read(const char * fn);
};
}
#endif
\ No newline at end of file
#endif
......@@ -37,12 +37,6 @@ namespace transformer
class T2TTrainer
{
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* buffer for loading words */
int * buf;
......@@ -75,6 +69,9 @@ public:
/* learning rate */
float lrate;
/* the parameter that controls the maximum learning rate in training */
float lrbias;
/* sentence batch size */
int sBatchSize;
......@@ -88,6 +85,22 @@ public:
/* training step number */
int nstep;
/* indicates whether we use adam */
bool useAdam;
/* hyper parameters of adam */
float adamBeta1;
float adamBeta2;
float adamDelta;
float adamBeta1T;
float adamBeta2T;
/* list of the moments of the parameter matrices */
XList moments;
/* list of the 2nd-order moments of the parameter matrices */
XList moments2nd;
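/* for orientation, a hedged sketch of the standard Adam update these fields
   support (the actual implementation lives in the trainer source file):
       m = adamBeta1 * m + (1 - adamBeta1) * grad
       v = adamBeta2 * v + (1 - adamBeta2) * grad * grad
       e = lrate * sqrt(1 - adamBeta2T) / (1 - adamBeta1T)
       p = p - e * m / (sqrt(v) + adamDelta)
   where adamBeta1T and adamBeta2T accumulate the powers beta1^t and beta2^t
   over training steps */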
public:
/* constructor */
T2TTrainer();
......@@ -101,17 +114,34 @@ public:
/* train the model */
void Train(const char * fn, T2TModel * model);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* load data to buffer */
int LoadBuf(FILE * file);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, XTensor * batch, int step, int vs, int sBatch, int wBatch, bool isSorted, int &wCount);
int LoadBatch(FILE * file, bool isLM,
XTensor * batch, XTensor * padding, XTensor * output,
int * seqs,
int step, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
/* update the model by delta rule */
void Update(T2TModel * model, const float lr);
/* prepare model for training */
void PrepareModel(T2TModel * model);
/* do padding on the output */
void PadOutput(XTensor * output, XTensor * padding);
};
......
......@@ -27,6 +27,8 @@ namespace transformer
{
FILE * tmpFILE;
int llnum = 0;
FILE * tf = NULL;
void LoadParamString(int argc, const char ** argv, const char * name, char * p, const char * defaultP)
{
......
......@@ -38,6 +38,9 @@ void LoadParamFloat(int argc, const char ** argv, const char * name, float * p,
/* show arguments */
void ShowParams(int argc, const char ** argv);
extern int llnum;
extern FILE * tf;
}
#endif
......@@ -38,20 +38,42 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, argv);
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
LoadParamString(argc, argv, "train", trainFN, "");
LoadParamString(argc, argv, "model", modelFN, "");
LoadParamString(argc, argv, "test", testFN, "");
LoadParamString(argc, argv, "output", outputFN, "");
T2TTrainer trainer;
trainer.Init(argc, argv);
T2TModel model;
model.InitModel(argc, argv);
if(strcmp(trainFN, "")){
T2TTrainer trainer;
trainer.Init(argc, argv);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, &model);
}
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if necessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
trainer.Test(testFN, outputFN, &model);
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
fclose(tmpFILE);
......
......@@ -41,6 +41,7 @@ XDevManager GDevs;
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
#ifdef USE_CUDA
......@@ -126,6 +127,7 @@ void XDevice::Init(int myDevID)
#endif
}
isInitialized = true;
}
/* clear it */
......@@ -152,11 +154,14 @@ void XDevice::Clear()
/* get cublas handle */
cublasHandle_t * XDevice::GetCublasHandle()
{
if (!isInitialized)
Init(devID);
if(!isHandleReady){
MUTEX_LOCK(cublasMutex);
int devIDBackup = 0;
ProtectCudaDev(devID, devIDBackup);
CheckNTErrors(cublasCreate(&cublasHandle) == cudaSuccess,
CheckNTErrors(cublasCreate(&cublasHandle) == CUBLAS_STATUS_SUCCESS,
"Cannot create the cublas handle.");
isHandleReady = true;
BacktoCudaDev(devID, devIDBackup);
......@@ -169,6 +174,9 @@ cublasHandle_t * XDevice::GetCublasHandle()
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
......@@ -279,33 +287,13 @@ void XDevManager::Init()
exit(1);
}
cudaDeviceProp prop[64];
for(int i = 0; i < GPUCount; i++){
GPUs[i].Init(i);
cudaGetDeviceProperties(&prop[i], i);
GPUs[i].devID = i;
//GPUs[i].Init(i);
}
#ifdef USA_CUDA_P2P
for(int i = 0; i < GPUCount; i++){
cudaSetDevice(i);
for(int j = 0; j < GPUCount; j++){
if(i == j)
continue;
int access;
cudaDeviceCanAccessPeer(&access, i, j);
bool hasUVA = (prop[i].unifiedAddressing && prop[j].unifiedAddressing);
fprintf(stderr, "device %d -> device %d access:%d UVA:%d\n", i, j, access, hasUVA ? 1 : 0);
if(access != 0){
CheckNTErrors((hasUVA == true), "at least one GPU does not support UVA.")
CheckNTErrors((cudaDeviceEnablePeerAccess(j, 0)==cudaSuccess), "cannot set cuda p2t mode!");
}
}
}
#endif
#endif
nGPU = GPUCount;
}
......@@ -351,6 +339,9 @@ into blocks
*/
int XDevManager::GetCudaThread(const int devID, const int n, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......@@ -402,6 +393,9 @@ into blocks
*/
int XDevManager::GetCudaThread2D(const int devID, const int n, const int m, int nLimit, int * gridSize, int * blockSize)
{
if (!GPUs[devID].isInitialized)
GPUs[devID].Init(devID);
memset(gridSize, 0, sizeof(int) * 3);
memset(blockSize, 0, sizeof(int) * 3);
......
......@@ -67,6 +67,9 @@ public:
/* warp size of an (NVIDIA) GPU */
int GPUWarpSize;
/* indicates whether the device class has been initialized */
bool isInitialized;
/*
max grid size (or number of blocks) of an (NVIDIA) GPU
NOTE: the grid size is along three dimensions (x, y, z)
......
......@@ -147,6 +147,7 @@ extern bool useCUDA;
#define XPRINT4(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4);FFLUSH(FILEH);}}
#define XPRINT5(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5);FFLUSH(FILEH);}}
#define XPRINT6(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6);FFLUSH(FILEH);}}
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
......
......@@ -53,28 +53,14 @@ typedef long long INT_64;
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
};
/*
mode of running a memory pool
- UNI_FREE: free all the memory space when the memory pool is no longer in use
- FREE_ON_THE_FLY: allocate and free memory pieces in the normal "malloc" and "free" manner
*/
enum MEMPOOL_MODE {UNI_FREE, FREE_ON_THE_FLY};
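/* hedged usage note: the mode is chosen when a pool is created, e.g. elsewhere
   in this commit:
       mem = new XMem(devID, isMemFreeOTF ? FREE_ON_THE_FLY : UNI_FREE,
                      (MTYPE)MILLION * 256, 1024, MILLION * 128); */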
struct MPieceNode;
/* header of a memory piece (FREE_ON_THE_FLY) */
struct MHeader
......@@ -96,6 +82,9 @@ struct MHeader
/* id of the memory block */
int blockID;
/* pointer to the index node */
MPieceNode * indexNode;
};
/* index of memory piece */
......@@ -112,6 +101,31 @@ struct MPieceNode
/* pointer to the head of a memory piece */
void * p;
/* pointer to the head of the memory that is returned to the user */
void * pReal;
/* header of the memory piece */
MHeader head;
};
/* memory block */
struct XMemBlock
{
/* pointer to where to start */
void * mem;
/* size of the block */
MTYPE size;
/* size of the used memory in this block */
MTYPE used;
/* desired size of the block */
MTYPE sizeDesired;
/* first head of the block */
MHeader * head;
};
/*
......@@ -138,6 +152,9 @@ public:
/* mode of running the memory pool */
MEMPOOL_MODE mode;
/* signature */
MTYPE signature;
/* indicates whether the memory allocation is static */
bool isStatic;
......@@ -194,13 +211,16 @@ public:
public:
/* index of the free memory pieces */
MPieceNode * freeMemIndex;
MPieceNode * memIndex;
/* for double buffering */
MPieceNode * memIndex2;
/* maximum number of index nodes */
INT_64 indexNodeNum;
INT_64 nodeNum;
/* count of the used nodes */
INT_64 indexNodeNumUsed;
INT_64 nodeNumUsed;
/* minimal size allocation for each index entry */
MTYPE * minSizeIndex;
......@@ -235,6 +255,9 @@ public:
/* free a piece of memory */
void Free(int myDevID, void * mem);
/* get signature */
MTYPE GetSignature();
/* use string as the name of the memory pool */
void SetName(const char * myName);
......@@ -282,10 +305,10 @@ public:
void * AllocBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
/* release a piece of memory */
void Release(void * p);
void Release(void * p, MTYPE size, MTYPE code);
/* release a piece of memory */
void Release(int myDevID, void * p);
void Release(int myDevID, void * p, MTYPE size);
/* release a piece of memory in the buffer */
void ReleaseBuf(int myDevID, MTYPE mySize, int pitch = BUF_PITCH);
......@@ -302,14 +325,20 @@ public:
/* find the index entry for allocation query */
int FindIndexEntry(MTYPE mySize);
/* remove an index node */
void RemoveIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for available memory pieces */
void RemoveFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node */
void AddIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for available memory pieces */
void AddFreeIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* remove an index node for memory pieces in use */
void RemoveAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* add an index node for memory pieces in use */
void AddAllocIndexNode(MPieceNode * node, MPieceNode * entry = NULL);
/* release a piece of memory as "free" */
void ReleaseStandard(int myDevID, void * p);
void ReleaseStandard(int myDevID, void * p, MTYPE size);
/* rebuild the index to merge small memory fragments and free blocks that are no longer in use */
void RebuildIndex();
......@@ -379,6 +408,9 @@ public:
extern XMem * GMem;
extern int testxmemid;
extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -29,10 +29,18 @@ const char * GetOPName(int type)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_CEIL)
return "M_CEIL";
else if (type == MATH_EXP)
return "M_EXP";
else if (type == MATH_FLOOR)
return "M_FLOOR";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_SQRT)
return "M_SQRT";
else if (type == MATH_SQUARE)
return "M_SQUARE";
else if (type == MATH_SIN)
return "M_SIN";
else if (type == MATH_COS)
......@@ -113,7 +121,9 @@ const char * GetOPName(int type)
return "S_TOPK";
}
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
if (type == FUNC_DROPOUT)
return "F_DROPOUT";
else if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
......
......@@ -32,9 +32,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_EXP MATH_ABSOLUTE + 1
#define MATH_LOG MATH_EXP + 1
#define MATH_SIN MATH_LOG + 1
#define MATH_CEIL MATH_ABSOLUTE + 1
#define MATH_EXP MATH_CEIL + 1
#define MATH_FLOOR MATH_EXP + 1
#define MATH_LOG MATH_FLOOR + 1
#define MATH_SQRT MATH_LOG + 1
#define MATH_SQUARE MATH_SQRT + 1
#define MATH_SIN MATH_SQUARE + 1
#define MATH_COS MATH_SIN + 1
#define MATH_TAN MATH_COS + 1
#define MATH_ROUND MATH_TAN + 1
......@@ -88,7 +92,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
#define FUNC_DROPOUT FUNCTION_BASE + 1
#define FUNC_HARDTANH FUNC_DROPOUT + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
......
......@@ -162,6 +162,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
......@@ -221,7 +222,8 @@ XTensor::~XTensor()
void XTensor::Init()
{
id = -1;
mem = NULL;;
mem = NULL;
signature = 0;
data = NULL;
dataHost = NULL;
dataP = NULL;
......@@ -254,7 +256,7 @@ void XTensor::DestroyData()
else if(data != NULL && isInGlobalMem)
FreeData(this, mem);
else if(data != NULL)
mem->Release(data);
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if(dataHost != NULL)
......@@ -298,6 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
newTensor->SetTMP();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
......@@ -1135,19 +1138,21 @@ resize a tensor with a specified tensor size
bool XTensor::Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
/* free old mem */
if(data != NULL){
if (mem == NULL)
XMemFree(devID, data);
else
mem->Release(data);
mem->Release(data, GetDataSizeInChar(), signature);
}
signature = mem != NULL ? mem->GetSignature() : 0;
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
isInit = true;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
......@@ -1243,56 +1248,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
}
/*
resize a tensor with a specified tensor size (with no data filled)
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> myDenseRatio - how often an element has non-zero value
<< return - succeeded or not
*/
bool XTensor::ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
/* free old mem */
if(data != NULL && mem == NULL)
delete[] (char*)data;
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
}
data = NULL;
denseRatio = myDenseRatio;
isSparse = denseRatio < 1.0F ? true : false;
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if(myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
if(zeroData){
unitNum = 0;
return false;
}
return true;
}
/*
resize a tensor by another one
>> myTensor - tensor for reference
*/
......@@ -1377,9 +1332,10 @@ dump data to a file
>> file - where to dump the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
......@@ -1437,28 +1393,26 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int verbo
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
if (unitNum > 0) {
DTYPE f = *(DTYPE*)d;
fprintf(file, "%e", f);
}
int num = unitNum;
if (n > 0)
num = MIN(num, n);
for (int i = 1; i < num; i++) {
DTYPE * f = ((DTYPE*)d) + i;
fprintf(file, " %e", *f);
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
fprintf(file, "%e", f);
else
fprintf(file, " %e", f);
}
}
else {
ShowNTErrors("Cannot dump the tensor to the file in non-float values!");
ShowNTErrors("TODO!");
}
}
else {
int num = this->unitNumNonZero > 0 ? *(int*)d : 0;
if (n > 0)
num = MIN(num, n);
if (beg + n > 0)
num = MIN(num, beg + n);
fprintf(file, "%d ", num);
for (int i = 0; i < num; i++) {
for (int i = beg; i < num; i++) {
int key = GetKeyInSparse(i);
DTYPE value = GetInSparse(i);
fprintf(file, "[%d]%e ", key, value);
......@@ -1481,13 +1435,14 @@ dump data to a file
>> file - where to dump the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int verbose)
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, verbose);
a.Dump(file, label, n, beg, verbose);
}
/*
......@@ -1670,6 +1625,8 @@ void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
tensor->isInGlobalMem = true;
}
}
tensor->signature = 0;
}
/*
......
......@@ -51,7 +51,6 @@ struct XLink;
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8
/* computation flags */
#define UNSAFE_BUT_FAST_MEM
#define FAST_MATRIX
......@@ -66,6 +65,9 @@ public:
/* memory pool */
XMem * mem;
/* signature of the memory pool */
MTYPE signature;
/* data array to keep the elements */
void * data;
......@@ -327,11 +329,6 @@ public:
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix with a specified matrix size (with no data filled) */
bool ResizeWithNoData(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
const float myDenseRatio = 1.0F);
/* resize a matrix by another one */
bool Resize(const XTensor * myTensor);
......@@ -339,11 +336,11 @@ public:
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* dump data to a file */
static
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
void Dump(const XTensor * tensor, FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
/* read data from a file */
void Read(FILE * file, const char * label = NULL);
......
......@@ -203,7 +203,7 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
......
......@@ -204,7 +204,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadInt(&c, alpha);
XLink::AddParamToHead(&c, alpha);
}
else{
ShowNTErrors("Something is wrong!");
......
......@@ -50,7 +50,6 @@ void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XName.h"
#include "Unary.h"
......@@ -5,9 +26,18 @@
namespace nts{
DTYPE square(DTYPE x)
{
return x * x;
}
DTYPE round(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
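/* note: this rounds half away from zero, e.g. round(2.5) == 3.0 and
   round(-2.5) == -3.0 */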
#ifdef USE_CUDA
/* define three macros separately, specify the respective function names */
/* define three macros separately, specify the respective function names (GPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -45,14 +75,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
......@@ -65,11 +116,8 @@ _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)*/
#else
/* define three macros separately, specify the respective function names */
/* define three macros separately, specify the respective function names (CPU mode) */
#define _SIMPLE_UNARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b) \
{ \
......@@ -102,14 +150,35 @@ _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-31
*/
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "Unary.h"
#include "Unary.cuh"
namespace nts {
__device__
DTYPE CudaSquare(DTYPE x)
{
return x * x;
}
__device__
DTYPE CudaRound(DTYPE r)
{
return (r > 0.0) ? (DTYPE)floor(r + 0.5) : (DTYPE)ceil(r - 0.5);
}
#define SIMPLE_UNARY_FUNCTION_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
......@@ -15,7 +49,7 @@ void Kernel##funcName(DTYPE * a, DTYPE * b, int size) \
b[i] = (DTYPE)origFunc(a[i]); \
} \
__global__ \
void Kernel##funcName(__half * a, __half * b, int size) \
void Kernel##funcName(__half * a, __half * b, int size) \
{ \
return; \
} \
......@@ -37,12 +71,12 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == DEFAULT_DTYPE) { \
Kernel##funcName << <blocks, threads >> > \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); \
} \
else if (a->dataType == X_FLOAT16) { \
Kernel##funcName << <blocks, threads >> > \
((__half*)a->data, (__half*)b->data, a->unitNum); \
Kernel##funcName<<<blocks, threads>>> \
((__half*)a->data, (__half*)b->data, a->unitNum); \
} \
else { \
ShowNTErrors("TODO!"); \
......@@ -52,11 +86,16 @@ void _Cuda##funcName(const XTensor * a, XTensor * b) \
} \
SIMPLE_UNARY_FUNCTION_GPU(Absolute, fabs)
SIMPLE_UNARY_FUNCTION_GPU(Ceil, ceil)
SIMPLE_UNARY_FUNCTION_GPU(Exp, exp)
SIMPLE_UNARY_FUNCTION_GPU(Floor, floor)
SIMPLE_UNARY_FUNCTION_GPU(Log, log)
SIMPLE_UNARY_FUNCTION_GPU(Round, CudaRound)
SIMPLE_UNARY_FUNCTION_GPU(Sqrt, sqrt)
SIMPLE_UNARY_FUNCTION_GPU(Square, CudaSquare)
SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
//SIMPLE_UNARY_FUNCTION_GPU(Round, round)
}
\ No newline at end of file
......@@ -38,6 +38,15 @@ void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
void _CudaAbsolute(const XTensor * a, XTensor * b);
/* set each entry to its ceil value (CUDA Kernel) */
__global__
void KernelCeil(DTYPE * a, DTYPE * b, int size);
/* set each entry to its ceil value (CUDA Kernel) with float16 data type */
__global__
void KernelCeil(__half * a, __half * b, int size);
/* set each entry to its ceil value */
void _CudaCeil(const XTensor * a, XTensor * b);
/* set each entry to its exponent value (CUDA Kernel) */
__global__
void KernelExp(DTYPE * a, DTYPE * b, int size);
......@@ -47,6 +56,15 @@ void KernelExp(__half * a, __half * b, int size);
/* set each entry to its exponent value */
void _CudaExp(const XTensor * a, XTensor * b);
/* set each entry to its floor value (CUDA Kernel) */
__global__
void KernelFloor(DTYPE * a, DTYPE * b, int size);
/* set each entry to its floor value (CUDA Kernel) with float16 data type */
__global__
void KernelFloor(__half * a, __half * b, int size);
/* set each entry to its floor value */
void _CudaFloor(const XTensor * a, XTensor * b);
/* set each entry to its logarithm value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * a, DTYPE * b, int size);
......@@ -56,6 +74,34 @@ void KernelLog(__half * a, __half * b, int size);
/* set each entry to its logarithm value */
void _CudaLog(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
__global__
void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type */
__global__
void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
void _CudaRound(const XTensor * a, XTensor * b);
/* set each entry to its sqrt value (CUDA Kernel) */
__global__
void KernelSqrt(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sqrt value (CUDA Kernel) with float16 data type */
__global__
void KernelSqrt(__half * a, __half * b, int size);
/* set each entry to its sqrt value */
void _CudaSqrt(const XTensor * a, XTensor * b);
/* set each entry to its square value (CUDA Kernel) */
__global__
void KernelSquare(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square value (CUDA Kernel) with float16 data type */
__global__
void KernelSquare(__half * a, __half * b, int size);
/* set each entry to its square value */
void _CudaSquare(const XTensor * a, XTensor * b);
/* set each entry to its sine value (CUDA Kernel) */
__global__
void KernelSin(DTYPE * a, DTYPE * b, int size);
......@@ -83,15 +129,6 @@ void KernelTan(__half * a, __half * b, int size);
/* set each entry to its tangent value */
void _CudaTan(const XTensor * a, XTensor * b);
/* set each entry to its round value (CUDA Kernel) */
//__global__
//void KernelRound(DTYPE * a, DTYPE * b, int size);
/* set each entry to its round value (CUDA Kernel) with float16 data type*/
//__global__
//void KernelRound(__half * a, __half * b, int size);
/* set each entry to its round value */
//void _CudaRound(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -28,95 +28,103 @@ namespace nts{
/* set every entry to its absolute value */
void _Absolute(const XTensor * a, XTensor * b);
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing */
void _AbsoluteMe(XTensor * a);
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
/* set every entry to its ceil value (do it on site)
keep the result in the input tensor a and return nothing */
void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b);
/*
set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its exponent value (do it on site)
keep the result in the input tensor a and return nothing */
void _ExpMe(XTensor * a);
/*
set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its exponent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
/* set every entry to its floor value (do it on site)
keep the result in the input tensor a and return nothing */
void _FloorMe(XTensor * a);
/* set every entry to its floor value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
/*
set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its logarithm value (do it on site)
keep the result in the input tensor a and return nothing */
void _LogMe(XTensor * a);
/*
set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its logarithm value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
/* set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing */
void _RoundMe(XTensor * a);
/* set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
/* set every entry to its sqrt value (do it on site)
keep the result in the input tensor a and return nothing */
void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
/* set every entry to its square value (do it on site)
keep the result in the input tensor a and return nothing */
void _SquareMe(XTensor * a);
/* set every entry to its square value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
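/* hedged usage sketch of the three calling conventions declared in this header,
   with Square as the example (a and b are initialized tensors of the same shape):
       _Square(&a, &b);        // write the result into b
       _SquareMe(&a);          // overwrite a in place
       XTensor c = Square(a);  // build and return a new tensor for the result */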
/* set every entry to its sine value */
void _Sin(const XTensor * a, XTensor * b);
/*
set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its sine value (do it on site)
keep the result in the input tensor a and return nothing */
void _SinMe(XTensor * a);
/*
set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its sine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a);
/* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b);
/*
set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its cosine value (do it on site)
keep the result in the input tensor a and return nothing */
void _CosMe(XTensor * a);
/*
set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its cosine value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a);
/* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b);
/*
set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing
*/
/* set every entry to its tangent value (do it on site)
keep the result in the input tensor a and return nothing */
void _TanMe(XTensor * a);
/*
set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
/* set every entry to its tangent value (return a XTensor structure)
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
/* set every entry to its round value */
//void _Round(const XTensor * a, XTensor * b);
/*
set every entry to its round value (do it on site)
keep the result in the input tensor a and return nothing
*/
//void _RoundMe(XTensor * a);
/*
set every entry to its round value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
//XTensor Round(const XTensor & a);
}
#endif //end __UNARY_H__
\ No newline at end of file
......@@ -77,7 +77,7 @@ void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize,
int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i];
target[blockSize * targetBlockID + targetOffset] = source[i];
}
/*
......@@ -98,16 +98,6 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
}
else
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
......
......@@ -405,7 +405,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
......@@ -482,7 +482,7 @@ void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
if (tid < 32){
if (tid < blockDim.y / 32)
threadMax = data[tid];
else threadMax = 0;
else threadMax = FLOAT_MIN;
threadMax = shflDownReduceMax(threadMax);
if (tid == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
......
......@@ -480,8 +480,8 @@ void KernelReduceSumFast(__half * input, __half * output,
if data storage is discontinuous, use this way to reduce
*/
__global__
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int blockNum,
int strideNum, DTYPE * shift, DTYPE power, bool isExp)
void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output, int stride, int strideNum,
int blockNum, DTYPE * shift, DTYPE power, bool isExp)
{
__shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int idx = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -629,7 +629,7 @@ inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long
if (vectorSize % 32 != 0) minWarpNum++;
warpNum = min(warpNum, minWarpNum);
grid.x = vectorNum;
grid.x = (unsigned int)vectorNum;
grid.y = 1;
grid.z = 1;
block.x = 1;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XName.h"
#include <math.h>
#include <time.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/ScaleAndShift.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a random Bernoulli number
*/
DTYPE RandomBernoulli(DTYPE prob)
{
return (DTYPE)rand()/(DTYPE)RAND_MAX > prob ? (DTYPE)1.0 : (DTYPE)0.0;
}
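/* note: this returns 1.0 with probability (1 - prob) and 0.0 with probability
   prob, i.e. it draws one entry of a keep/drop mask for dropout */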
/*
dropout function
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> y - output tensor
>> seed - random seed for generating the dropout mask
>> prob - probability of setting an element to zero
*/
void _Dropout(const XTensor *x, XTensor *y, unsigned int seed, DTYPE prob)
{
CheckNTErrors(prob >= 0.0 && prob <= 1.0, "The probability must be 0-1!");
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given dropout probability */
srand(seed);
int unitNum = x->unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >=0 || y->devID >= 0){
_CudaDropout(x, y, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
XTensor * inter = NewTensorBuf(x, x->devID, x->mem);
_Multiply(x, maskTensor, inter);
_ScaleAndShift(inter, y, scaleFactor, 0);
DelTensorBuf(inter);
DelTensorBuf(maskTensor);
delete[] maskArray;
}
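/* a minimal standalone sketch of the same inverted-dropout rule on a raw
   float array, independent of the XTensor API. DropoutArray is a hypothetical
   helper for illustration only (it is not part of the library) and relies on
   srand/rand from <stdlib.h>. */
void DropoutArray(const float * x, float * y, int n, float prob, unsigned int seed)
{
    srand(seed);

    /* inverted dropout: keep an element with probability (1 - prob), then
       rescale by 1/(1 - prob) so the expected value is unchanged */
    float scale = 1.0F / (1.0F - prob);
    for(int i = 0; i < n; i++){
        float keep = ((float)rand() / (float)RAND_MAX > prob) ? 1.0F : 0.0F;
        y[i] = x[i] * keep * scale;
    }
}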
/*
dropout function (return a XTensor structure)
make a new tensor to keep the result and return it
During training, randomly zeroes some of the elements of the input tensor
with probability p using samples from a Bernoulli distribution.
The elements to zero are randomized on every forward call.
This has proven to be an effective technique for regularization and
preventing the co-adaptation of neurons as described in the paper
"Improving neural networks by preventing co-adaptation of feature detectors".
Furthermore, the outputs are scaled by a factor of \frac{1}{1-p} during training.
This means that during evaluation the module simply computes an identity function.
>> x - input tensor
>> prob - probability of setting an element to zero
<< return - the result of the dropout function
*/
XTensor Dropout(const XTensor &x, DTYPE prob)
{
XTensor y(&x);
y.SetTMP();
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - prob);
/* generate a mask tensor with the given dropout probability */
srand((unsigned int)time(NULL));
int unitNum = x.unitNum;
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor maskTensor(&x);
maskTensor.SetData(maskArray, unitNum);
XTensor inter;
inter = Multiply(x, maskTensor);
y = ScaleAndShift(inter, scaleFactor, 0);
delete[] maskArray;
///* tensor connection */
//XLink::MakeLink(&x, NULL, &y, FUNC_DROPOUT);
//XLink::AddParamToHead(&y, prob);
return y;
}
/*
backward computation of dropout function
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> seed - random seed used to regenerate the same dropout mask
>> prob - probability of setting an element to zero
*/
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
int unitNum = y->unitNum;
DTYPE scaleFactor = (DTYPE)1.0F / ((DTYPE)1.0F - prob);
/* regenerate the mask tensor with the same seed and probability */
srand(seed);
DTYPE * maskArray = new DTYPE[unitNum];
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(prob);
XTensor * maskTensor = NewTensorBuf(x, x->devID, x->mem);
maskTensor->SetData(maskArray, unitNum);
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
_CudaDropoutBackward(y, x, dedy, dedx, maskTensor, scaleFactor);
DelTensorBuf(maskTensor);
delete[] maskArray;
return;
}
#endif
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
/* dE/dx = dE/dy * dy/dx */
for(int i = 0; i < unitNum; i++)
dedxp[i] = dedyp[i] * maskArray[i] * scaleFactor;
DelTensorBuf(maskTensor);
delete[] maskArray;
}
else
ShowNTErrors("TODO!");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "Dropout.h"
#include "Dropout.cuh"
#include "Loss.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
#endif
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
dropout function (Cuda kernel)
>> x - input data pointer
>> y - output data pointer
>> m - mask indicator to set zero
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutCompute(DTYPE * x, DTYPE * y, DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
y[i] = x[i] * m[i] * s;
}
}
/*
dropout function (Cuda version)
>> x - input tensor
>> y - output tensor
>> mask - mask tensor to set 0
>> scaleFactor - the scale factor
*/
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
CheckNTErrors(!x->isSparse && !y->isSparse, "the dropout function does not support sparse matrices.");
CheckNTErrors(x->unitNum == y->unitNum, "we require two vectors with the same length.");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
KernelDropoutCompute<<<dim3(gridSize[0]), dim3(blockSize[0])>>>((DTYPE*)x->data, (DTYPE*)y->data, (DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
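GDevs.GetCudaThread picks a one-dimensional launch configuration for x->unitNum elements; conceptually (a hypothetical sketch, not the actual implementation of GetCudaThread) it amounts to:
/* one thread per tensor element, rounded up to whole blocks */
int threadsPerBlock = 256;                                         /* assumed value */
int blocks = (x->unitNum + threadsPerBlock - 1) / threadsPerBlock;
/* inside the kernel, i = blockDim.x * blockIdx.x + threadIdx.x, guarded by (i < size) */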
/*
backward computation of dropout function (Cuda kernel)
dE/dx = dE/dy * dy/dx
>> dedy - dE/dy
>> dedx - dE/dx
>> m - mask used to zero out dropped elements
>> s - the scale factor
>> size - size of input/output
*/
__global__
void KernelDropoutBackward(DTYPE * dedy, DTYPE * dedx,
DTYPE * m, DTYPE s, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size){
dedx[i] = dedy[i] * m[i] * s;
}
}
/*
backward computation of dropout function (Cuda version)
dE/dx = dE/dy * dy/dx
>> y - output of the dropout function
>> x - input of the dropout function
>> dedy - dE/dy
>> dedx - dE/dx
>> mask - mask tensor used to zero out dropped elements
>> scaleFactor - the scale factor
*/
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor)
{
int gridSize[3], blockSize[3];
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
GDevs.GetCudaThread(x->devID, x->unitNum, gridSize, blockSize);
int devIDBackup;
ProtectCudaDev(x->devID, devIDBackup);
/* dE/dx = dE/dy * dy/dx */
KernelDropoutBackward<<<dim3(gridSize[0]),dim3(blockSize[0])>>>
((DTYPE*)dedy->data, (DTYPE*)dedx->data,
(DTYPE*)mask->data, scaleFactor, x->unitNum);
BacktoCudaDev(x->devID, devIDBackup);
}
else
ShowNTErrors("TODO!");
}
#endif
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_CUH__
#define __DROPOUT_CUH__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout function (Cuda version) */
void _CudaDropout(const XTensor * x, XTensor * y, const XTensor * mask, DTYPE scaleFactor);
/* de/dx (Cuda version) */
void _CudaDropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
const XTensor * mask, DTYPE scaleFactor);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __DROPOUT_H__
#define __DROPOUT_H__
#include "../XTensor.h"
#include "Loss.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* dropout function */
void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE prob = 0.5);
/* dropout function */
XTensor Dropout(const XTensor &x, DTYPE prob = 0.5);
/* de/dx */
void _DropoutBackward(const XTensor * y, const XTensor * x,
const XTensor * dedy, XTensor * dedx,
unsigned int seed, DTYPE prob = 0.5);
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUT_H__
\ No newline at end of file
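The header above exposes two entry points: _Dropout fills a caller-allocated output and takes an explicit seed, while Dropout creates and returns its result and uses the default generator. A short sketch (x, y2 and seed are assumed to be set up as in the tests later in this commit):
XTensor y1 = Dropout(x, 0.3F);   /* returns a new tensor; prob defaults to 0.5 */
_Dropout(&x, &y2, seed, 0.3F);   /* writes into y2; the seed makes the mask reproducible */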
......@@ -51,8 +51,7 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE),
"TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1;
int dimensionSize = output->dimSizeRDI[leadDimRDI];
......
......@@ -58,8 +58,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
"TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID >= 0), "Tensors must be on GPU device!");
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
......
......@@ -48,19 +48,19 @@ loss function to measure the "number" of errors
/* compute the loss */
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "../XName.h"
#include <math.h>
......
......@@ -16,8 +16,8 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "Sigmoid.h"
#include "Sigmoid.cuh"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* rectify function y = max(0, x) (Cuda version) */
/* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
......
......@@ -45,20 +45,17 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
if(i < leadDim)
dimSize[i] = -x->dimSize[i];
dimSize[i] = x->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = -x->dimSize[i];
dimSize[i - 1] = x->dimSize[i];
}
XMem * mem = x->mem;
XTensor * max = NULL;
XTensor * sum = NULL;
max = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensor(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
......@@ -114,18 +111,9 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
}
}
if(mem != NULL){
mem->ReleaseBuf(mem->devID, max->unitNum * max->unitSize);
mem->ReleaseBuf(mem->devID, sum->unitNum * sum->unitSize);
}
else{
XMemFree(max->devID, max->data);
XMemFree(sum->devID, sum->data);
max->data = NULL;
sum->data = NULL;
}
delete max;
delete sum;
DelTensorBuf(sum);
DelTensorBuf(max);
delete[] dimSize;
}
else
......
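The _Softmax hunk above replaces the manual AllocBuf/ReleaseBuf bookkeeping with the NewTensorBuf/DelTensorBuf helpers and releases the buffers in reverse order of allocation. A sketch of the intended pattern, assuming the buffer pool behaves like a stack:
XTensor * max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
XTensor * sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
/* ... compute with max and sum ... */
DelTensorBuf(sum);   /* last allocated, first released */
DelTensorBuf(max);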
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#include "../XUtility.h"
#include "TDropout.h"
#include "../core/getandset/SetData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Dropout function.
*/
bool TestDropout1()
{
/* an input tensor of size (40, 50) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 40;
dimSize[1] = 50;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetDataRand(0, 1);
y->SetZeroAll();
/* call Dropout function */
float prob = 0.2F;
int seed = 20;
_Dropout(x, y, seed, prob);
yUser = Dropout(*x);
/* check result */
int zeroNum1 = 0;
int zeroNum2 = 0;
float * data1 = (float*)y->data;
float * data2 = (float*)yUser.data;
for (int i = 0; i < unitNum; i++){
DTYPE tmp1 = data1[i];
DTYPE tmp2 = data2[i];
if(tmp1 == 0.0F)
zeroNum1 += 1;
if(tmp2 == 0.0F)
zeroNum2 += 1;
}
printf("CPU Test:\n");
printf("In tensor y, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUser, there are %d units.\n", unitNum);
printf("There are %d zero units by Dropout layer with default probability %.2f.\n", zeroNum2, 0.5F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetDataRand(0, 1);
yGPU->SetZeroAll();
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
yUserGPU = Dropout(*xGPU);
/* check result: copy the GPU results back to the host before reading them */
zeroNum1 = 0;
zeroNum2 = 0;
float * hostData1 = new float[unitNum];
float * hostData2 = new float[unitNum];
XMemCopy(hostData1, -1, yGPU->data, yGPU->devID, unitNum * sizeof(float));
XMemCopy(hostData2, -1, yUserGPU.data, yUserGPU.devID, unitNum * sizeof(float));
for (int i = 0; i < unitNum; i++){
if(hostData1[i] == 0.0F)
zeroNum1 += 1;
if(hostData2[i] == 0.0F)
zeroNum2 += 1;
}
delete[] hostData1;
delete[] hostData2;
printf("GPU Test:\n");
printf("In tensor yGPU, there are %d units.\n", unitNum);
printf("%d units are set to zero by the Dropout layer with probability %.2f.\n", zeroNum1, prob);
printf("In tensor yUserGPU, there are %d units.\n", unitNum);
printf("%d units are set to zero by the Dropout layer with the default probability %.2f.\n", zeroNum2, 0.5F);
/* destroy variables */
delete x;
delete y;
delete xGPU;
delete yGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test Dropout function and backward computation.
*/
bool TestDropout2()
{
/* an input tensor of size (4, 5) */
int order = 2;
int * dimSize = new int[order];
dimSize[0] = 4;
dimSize[1] = 5;
int unitNum = 1;
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor * dedx = NewTensor(order, dimSize);
XTensor * dedy = NewTensor(order, dimSize);
/* initialize variables */
_SetDataFixedFloat(x, 1.0F);
y->SetZeroAll();
dedx->SetZeroAll();
_SetDataFixedFloat(dedy, 1.0F);
/* call Dropout function */
float prob = 0.5F;
int seed = 1;
_Dropout(x, y, seed, prob);
_DropoutBackward(y, x, dedy, dedx, seed, prob);
/* check result */
y->Dump(stderr, "y");
dedx->Dump(stderr, "dedx");
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedxGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * dedyGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
/* initialize variables */
_SetDataFixedFloat(xGPU, 1.0F);
yGPU->SetZeroAll();
dedxGPU->SetZeroAll();
_SetDataFixedFloat(dedyGPU, 1.0F);
/* call Dropout function */
_Dropout(xGPU, yGPU, seed, prob);
_DropoutBackward(yGPU, xGPU, dedyGPU, dedxGPU, seed, prob);
/* check result */
yGPU->Dump(stderr, "yGPU");
dedxGPU->Dump(stderr, "dedxGPU");
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete xGPU;
delete yGPU;
delete dedxGPU;
delete dedyGPU;
delete[] dimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete x;
delete y;
delete dedx;
delete dedy;
delete[] dimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Dropout Function */
bool TestDropout()
{
XPRINT(0, stdout, "[TEST DROPOUT] dropout function and its backward computation \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestDropout1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestDropout2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-09-12
*/
#ifndef __TEST_DROPOUT_H__
#define __TEST_DROPOUT_H__
#include "../function/Dropout.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Dropout Function */
bool TestDropout();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_DROPOUT_H__
......@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/
#include<math.h>
#include "../core/math/ScaleAndShift.h"
#include "TLoss.h"
......@@ -62,7 +63,7 @@ bool TestLoss1()
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -82,7 +83,7 @@ bool TestLoss1()
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -140,7 +141,7 @@ bool TestLoss2()
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -160,7 +161,7 @@ bool TestLoss2()
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......@@ -226,7 +227,7 @@ bool TestLoss3()
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
cpuTest = (fabs(error - answer) < 1e-4);
#ifdef USE_CUDA
/* GPU test */
......@@ -244,7 +245,7 @@ bool TestLoss3()
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
gpuTest = (fabs(error - answer) < 1e-4);
/* destroy variables */
delete output;
......
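The TLoss hunks above replace exact equality on floating-point results with a small tolerance; CPU and GPU reductions may sum in different orders, so bit-exact agreement cannot be expected. The pattern is simply:
/* never compare floating-point results with ==; allow a small tolerance */
cpuTest = (fabs(error - answer) < 1e-4);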
......@@ -33,52 +33,130 @@ bool TestXMemCase1()
int blockSize = 16;
int testNum = caseNum * 10;
for(int i = 0, scalar = 1; i < 3; i++){
XMem mem;
mem.Initialize(-1, FREE_ON_THE_FLY, blockSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blockSize * sizeof(int) / 2);
srand(907);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for(int i = 0; i < caseNum; i++){
p[i] = NULL;
size[i] = rand() % (2*blockSize);
}
int devIDs[2];
int devNum = 1;
devIDs[0] = -1;
/*if (GDevs.nGPU > 0) {
devIDs[1] = 0;
devNum = 2;
devIDs[0] = 0;
devNum = 1;
}*/
int * buf = new int[blockSize * 10];
for (int id = 0; id < devNum; id++) {
int devID = devIDs[id];
for (int iter = 0, scalar = 1; iter < 3; iter++) {
XMem mem;
mem.Initialize(devID, FREE_ON_THE_FLY, blockSize * sizeof(int) * scalar * scalar, 1000, 0);
mem.SetIndex(10000, blockSize * sizeof(int) / 2);
srand(907);
int ** p = new int*[caseNum];
int * size = new int[caseNum];
for (int i = 0; i < caseNum; i++) {
p[i] = NULL;
size[i] = rand() % (2 * blockSize);
}
for(int i = 0; i < testNum * scalar; i++){
int j = rand() % caseNum;
for (int i = 0; i < testNum * scalar; i++) {
testxmemid++;
//fprintf(stderr, "%d %d\n", testxmemid, ok);
int j = rand() % caseNum;
if(p[j] == NULL){
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for(int k = 0; k < size[j]; k++)
p[j][k] = j;
}
else{
mem.ReleaseStandard(mem.devID, p[j]);
for(int k = 0; k < size[j]; k++)
p[j][k] = -1;
p[j] = NULL;
}
if (p[j] == NULL) {
p[j] = (int*)mem.AllocStandard(mem.devID, size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = j;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
}
else {
mem.ReleaseStandard(mem.devID, p[j], size[j] * sizeof(int));
for (int k = 0; k < size[j]; k++)
buf[k] = -1;
XMemCopy(p[j], devID, buf, -1, sizeof(int) * size[j]);
p[j] = NULL;
}
for(int k = 0; k < caseNum; k++){
if(p[k] != NULL){
for(int o = 0; o < size[k]; o++){
if(p[k][o] != k){
ok = false;
for (int k = 0; k < caseNum; k++) {
if (p[k] != NULL) {
XMemCopy(buf, -1, p[k], devID, sizeof(int) * size[k]);
for (int o = 0; o < size[k]; o++) {
if (buf[o] != k) {
ok = false;
}
}
}
}
/*MPieceNode * entry = NULL;
MPieceNode * node = NULL;
entry = mem.memIndex + mem.indexEntryNum + mem.FindIndexEntry(112);
int cc = 0;
node = entry->next;
while(node != NULL){
fprintf(stderr, "%d ", cc++);
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
}
}
fprintf(stderr, "\n");*/
/*int ccc = 0;
bool hhh = recordp != NULL ? false : true;
for(int i = 0; i < mem.indexEntryNum; i++){
MPieceNode * entry = mem.memIndex + mem.indexEntryNum + i;
MPieceNode * last = entry;
MPieceNode * node = entry->next;
ccc = 0;
while(node != NULL){
CheckNTErrors(node->pre == last, "XSomething is wrong!");
CheckNTErrors(last->next == node, "XSomething is wrong!");
last = node;
ccc++;
if(node->pReal == recordp){
hhh = true;
}
if(node->size == 0){
MPieceNode * next = node->next;
node = next;
}
else{
CheckNTErrors(node->pReal != NULL, "Illegal pointer!");
node = node->next;
}
}
}
if(!hhh){
int nnn = 0;
}*/
}
}
delete[] p;
delete[] size;
scalar *= 2;
delete[] p;
delete[] size;
scalar *= 2;
}
}
delete[] buf;
return ok;
}
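The rewritten TestXMemCase1 stages every read and write through a host buffer with XMemCopy, so the same alloc/release loop works whether mem lives on the CPU (devID = -1) or on a GPU. The core round trip, reduced to a sketch under those assumptions:
XMem mem;
mem.Initialize(-1, FREE_ON_THE_FLY, 1024 * sizeof(int), 1000, 0);
int * p = (int*)mem.AllocStandard(mem.devID, 64 * sizeof(int));
/* ... move data in and out with XMemCopy(p, mem.devID, hostBuf, -1, ...) ... */
mem.ReleaseStandard(mem.devID, p, 64 * sizeof(int));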
......@@ -113,4 +191,4 @@ bool TestXMem()
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -76,6 +76,7 @@ bool Test()
wrong = !TestUnsqueeze() || wrong;
wrong = !TestXMem() || wrong;
wrong = !TestDropout() || wrong;
wrong = !TestHardTanH() || wrong;
wrong = !TestIdentity() || wrong;
wrong = !TestLogSoftmax() || wrong;
......
......@@ -69,6 +69,7 @@
#include "TUnsqueeze.h"
#include "TXMem.h"
#include "TDropout.h"
#include "THardTanH.h"
#include "TIdentity.h"
#include "TLogSoftmax.h"
......