Commit 3852f15a by huchi

Merge with branch: xiaotong-working

parent 98a9130d
......@@ -27,6 +27,7 @@
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
#include "./train/TTrain.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -38,8 +39,17 @@ using namespace nmt;
int main( int argc, const char ** argv )
{
if(argc > 1 && !strcmp(argv[1], "-test"))
XConfig config;
if(argc > 1){
config.Create(argc - 1, argv + 1);
verboseLevel = config.GetInt("verbose", 1);
}
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
TestTrain();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
......@@ -47,7 +57,8 @@ int main( int argc, const char ** argv )
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
......
......@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for an activation function */
......
......@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a loss computation */
......
......@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
dims[0] = -abs(dims[0]);
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
dims[whereToMerge - leadDim] *= abs(dims[0]);
int * dimsNode = dims + 1;
dimsNode[0] = -abs(dimsNode[0]);
XTensor gradNodeSmall(node->order - leadDim, dimsNode,
node->dataType, node->denseRatio,
node->devID, node->mem);
......@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
node->visitMark = NODE_DOING;
node->isGradFinished = true;
}
/*
......@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
\ No newline at end of file
......@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
node->isGradFinished = false;
}
/* back-propagation from output to input */
......@@ -108,7 +109,7 @@ void XNet::Backward(TensorList &roots)
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
//CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark != NODE_FINISHED)
......@@ -127,7 +128,20 @@ void XNet::Backward(TensorList &roots)
delete node;
}
}
}
}
for (int i = 0; i < nodes.count; i++) {
XTensor* node = (XTensor*)nodes.Get(i);
if (node->income.tailNum >= 100 || node->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure that the node should connect so many (100) nodes?\n");
}
if (node->grad != NULL) {
XTensor* grad = node->grad;
if (grad->income.tailNum >= 100 || grad->outgo.tailNum >= 100) {
XPRINT(1, stderr, "Are you sure that the grad node should connect so many (100) nodes?\n");
}
}
}
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -224,8 +224,6 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor maskDec;
XTensor maskEncDec;
bool debug(false);
/* encoder mask */
MakeMTMaskEnc(paddingEnc, maskEnc);
......@@ -234,25 +232,9 @@ void Model::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
encoding = MakeEncoder(inputEnc, &maskEnc, isTraining);
if (debug) {
LOG("after encoding:");
encoding.mem->ShowMemUsage(stderr);
}
decoding = MakeDecoder(inputDec, encoding, &maskDec, maskEncDec, isTraining);
if (debug) {
LOG("after decoding:");
encoding.mem->ShowMemUsage(stderr);
}
outputLayer->Make(decoding, output, true, true);
if (debug) {
LOG("after outputing:");
encoding.mem->ShowMemUsage(stderr);
exit(0);
}
}
/*
......@@ -287,6 +269,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor* maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
......@@ -297,6 +280,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -305,6 +289,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, 1.0F,
paddingEnc.devID, paddingEnc.mem);
......@@ -331,6 +316,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......@@ -344,7 +330,6 @@ void Model::MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc)
/* mask of the padding */
Unsqueeze(paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
Unsqueeze(padding2, maskEnc, 0, nhead);
ScaleAndShiftMe(maskEnc, 1e9F, -1e9F);
}
......@@ -378,7 +363,6 @@ void Model::MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
Unsqueeze(paddingEnc, maskEncDecTMP, paddingEnc.order - 1, paddingDec.GetDim(-1));
ScaleAndShiftMe(maskEncDecTMP, 1e9F, -1e9F);
Unsqueeze(maskEncDecTMP, maskEncDec, 0, dims[0]);
delete[] dims;
......@@ -571,4 +555,14 @@ void Model::Read(FILE* file)
LOG("model loaded (took %.1fs)", elapsed);
}
/* clone the model to a given device (overloaded method of XModel).
   >> devID - the device where the cloned model would live
   NOTE(review): stub - cloning is not implemented yet; callers always get nullptr,
   so multi-device training through XModel cannot work until this is filled in. */
XModel* Model::Clone(int devID)
{
return nullptr;
}
/* run the neural network for one step (overloaded method of XModel).
   >> inputs - input tensors
   >> outputs - output tensors
   >> golds - gold-standard tensors
   >> losses - resulting loss tensors
   << always false for now
   NOTE(review): stub - the forward/backward pass is not hooked up here yet,
   so this always reports failure. */
bool Model::RunSimple(XList* inputs, XList* outputs, XList* golds, XList* losses)
{
return false;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -24,17 +24,18 @@
#include "Encoder.h"
#include "Decoder.h"
#include "Utility.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "submodel/Attention.h"
#include "../../train/XModel.h"
namespace nmt
{
/* a nmt model that keeps parameters of the encoder,
/* an nmt model that keeps parameters of the encoder,
the decoder and the output layer (softmax). */
class Model
class Model : public XModel
{
public:
/* device id */
......@@ -85,26 +86,26 @@ public:
/* make the encoding network */
XTensor MakeDecoder(XTensor& inputEnc, XTensor& inputDec, XTensor* mask,
XTensor& MaskEncDec, bool isTraining);
XTensor& MaskEncDec, bool isTraining);
/* make the network for language modeling (with the output softmax layer) */
void MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output,
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
XTensor& paddingEnc, XTensor& paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor& paddingEnc, XTensor& maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec);
XTensor& maskDec, XTensor& maskEncDec);
/* get parameter matrices */
void GetParams(TensorList& list);
......@@ -114,6 +115,13 @@ public:
/* read the parameters */
void Read(FILE* file);
public:
/* clone the model (overloaded method of XModel) */
XModel * Clone(int devID);
/* run the neural network (overloaded method of XModel) */
bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -28,6 +28,7 @@
#include "Utility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XConfig.h"
using namespace nts;
using namespace std;
......@@ -165,89 +166,7 @@ int Config::LoadFromFile(const char* configFN, char** args) {
return argsNum;
}
/*
load a string parameter from the command line
>> argc - number of arguments
>> argv - argument list
>> name - parameter name (without the leading '-')
>> p - buffer where the value is written
>> defaultP - default value used when the parameter is absent
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
    char vname[128];
    /* build "-name"; snprintf bounds the write so an overly long name
       cannot overflow vname (the old strcpy was unbounded) */
    snprintf(vname, sizeof(vname), "-%s", name);

    bool hit = false;
    for (int i = 0; i < argc; i++) {
        /* a string option must be followed by its value */
        if (!strcmp(argv[i], vname) && i + 1 < argc) {
            strcpy(p, argv[i + 1]);
            hit = true;
            break;
        }
    }
    if (!hit)
        strcpy(p, defaultP);
}
/*
load an integer parameter from the command line
>> argc - number of arguments
>> argv - argument list
>> name - parameter name (without the leading '-')
>> p - where the value is written
>> defaultP - default value used when the parameter is absent
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
    char vname[128];
    /* build "-name"; snprintf bounds the write (the old strcpy was unbounded) */
    snprintf(vname, sizeof(vname), "-%s", name);

    bool hit = false;
    for (int i = 0; i < argc; i++) {
        /* an integer option must be followed by its value */
        if (!strcmp(argv[i], vname) && i + 1 < argc) {
            /* p is already int*, so the old *(int*)p cast was redundant */
            *p = atoi(argv[i + 1]);
            hit = true;
            break;
        }
    }
    if (!hit)
        *p = defaultP;
}
/*
load a boolean parameter from the command line; its mere presence means true
>> argc - number of arguments
>> argv - argument list
>> name - parameter name (without the leading '-')
>> p - where the value is written
>> defaultP - default value used when the parameter is absent
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
    char vname[128];
    /* build "-name"; snprintf bounds the write (the old strcpy was unbounded) */
    snprintf(vname, sizeof(vname), "-%s", name);

    bool hit = false;
    for (int i = 0; i < argc; i++) {
        /* a boolean option takes no value - seeing the flag is enough */
        if (!strcmp(argv[i], vname)) {
            /* p is already bool*, so the old *(bool*)p cast was redundant */
            *p = true;
            hit = true;
            break;
        }
    }
    if (!hit)
        *p = defaultP;
}
/*
load a float parameter from the command line
>> argc - number of arguments
>> argv - argument list
>> name - parameter name (without the leading '-')
>> p - where the value is written
>> defaultP - default value used when the parameter is absent
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
    char optName[128];
    optName[0] = '-';
    strcpy(optName + 1, name);

    /* start from the default and overwrite it on a match */
    *p = defaultP;

    /* a float option must be followed by its value, so stop one short of the end */
    for (int k = 0; k + 1 < argc; k++) {
        if (strcmp(argv[k], optName) == 0) {
            *p = (float)atof(argv[k + 1]);
            return;
        }
    }
}
/*
print the command-line arguments on stderr (for logging)
>> argc - number of arguments
>> argv - argument list
*/
void ShowParams(int argc, char** argv)
{
    fprintf(stderr, "args:\n");
    for (int i = 0; i < argc; i++) {
        /* check argv[i][0] first: on an empty string the old code read
           argv[i][1], one byte past the terminator (out-of-bounds) */
        if (argv[i][0] == 0 || argv[i][1] == 0)
            continue;
        /* an option starts with '-' but is not a negative number like "-1" */
        if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
            if (i + 1 < argc && argv[i + 1][0] != '-')
                fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
            else
                fprintf(stderr, " %s=yes\n", argv[i]);
        }
    }
    fprintf(stderr, "\n");
}
/*
split string by delimiter, this will return indices of all sub-strings
......@@ -281,7 +200,9 @@ IntList SplitInt(const string& s, const string& delimiter)
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
/* FIXME: strtol returns a long (64-bit on most platforms); check whether IntList should store 64-bit values instead of truncating to int */
values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
......@@ -297,4 +218,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
return values;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,16 +34,6 @@ namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
......@@ -115,10 +105,10 @@ public:
/* the maximum length in positional embedding */
int maxPosition;
/* the maximum length for the source sequence */
/* the maximum length of the source sequence */
int maxSrcLen;
/* the maximum length for the target sequence */
/* the maximum length of the target sequence */
int maxTgtLen;
/* the dimension of fnn hidden layer */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -259,7 +259,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = float(sqrt(d / nhead));
float scaling = (float)sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
......@@ -373,7 +373,7 @@ XTensor Attention::RPDotProduct(XTensor& x, XTensor& y, XTensor& z, const bool i
xTrans = Transpose(x, 0, 1);
XTensor relative;
relative = BMMul(xTrans, X_NOTRANS, z, transposeFlag);
relative = MatrixMulBatched(xTrans, X_NOTRANS, z, transposeFlag);
XTensor relativeTrans;
relativeTrans = Transpose(relative, 0, 1);
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -67,9 +67,7 @@ void FNN::InitModel(Config& config)
float scale = 1.0F;
_SetDataFanInOut(&w1, scale);
_SetDataFanInOut(&w2, scale);
//w1.SetDataRand(-(DTYPE)sqrt(6.0F / inSize), (DTYPE)sqrt(6.0F / inSize));
//w2.SetDataRand(-(DTYPE)sqrt(6.0F / hSize), (DTYPE)sqrt(6.0F / hSize));
b1.SetZeroAll();
b2.SetZeroAll();
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -29,6 +29,7 @@
#include "../../../tensor/XList.h"
#include "../../../tensor/XTensor.h"
#include "../../../tensor/XGlobal.h"
#include "../../../train/XBaseTemplate.h"
using namespace std;
......@@ -74,8 +75,8 @@ struct ReservedIDs {
};
/* A `TrainDataSet` is associated with a file which contains training data. */
struct TrainDataSet {
struct TrainDataSet : public DataDistributeBase
{
public:
/* the pointer to file stream */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -97,7 +97,6 @@ initialization
void Trainer::Init(Config& config)
{
cfg = &config;
lrate = config.lrate;
lrbias = config.lrbias;
sBatchSize = config.sBatchSize;
......@@ -242,17 +241,8 @@ void Trainer::Train(const char* fn, const char* validFN,
DTYPE lossLocal = lossBatch / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
net.isGradEfficient = true;
bool debug(false);
if (debug) {
LOG("after forward:");
batchEnc.mem->ShowMemUsage(stderr);
exit(0);
}
if (doUpdate) {
/* back-propagation */
net.Backward(lossTensor);
if (model->encoder->useHistory)
......@@ -502,6 +492,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBufV2(v, v->devID, v->mem);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -511,6 +502,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
base = (length + 5.0F) / (1.0F + 5.0F);
lp = float(pow(base, alpha));
lp = (float)pow(base, alpha);
return lp;
}
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
//float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
//bool isCompleted = state.isCompleted &&
// (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
......@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
......@@ -874,4 +873,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
delete[] finishedFlags;
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -161,7 +161,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
batchLoader.outputBuffer.emplace_back(emptyRes);
}
double startDump = GetClockSec();
//double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
......@@ -169,10 +169,10 @@ void Translator::Translate(const char* ifn, const char* sfn,
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
//double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.outputBuffer.size() + batchLoader.emptyLines.size());
wordCountTotal, batchLoader.inputBuffer.size() + batchLoader.emptyLines.size());
}
/*
......@@ -202,4 +202,4 @@ void Translator::Dump(FILE* file, XTensor* output)
}
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
startID = (int)stol(sid);
vocabSize = (int)stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
word2id[word] = (int)stol(id);
id2word[(int)stol(id)] = word;
}
f.close();
......@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
id2word.insert(i2w);
}
}
\ No newline at end of file
}
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.NMT - an open-source neural machine translation system.
* Copyright (C) 2020 NiuTrans Research. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of paramters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
/* constructor: start with an empty argument keeper */
XConfig::XConfig()
{
    args  = NULL;
    n     = 0;
    nReal = 0;
}
/* de-constructor: free every stored argument string, then the list itself */
XConfig::~XConfig()
{
    for (int i = 0; i < n; i++)
        delete[] args[i];
    delete[] args;
}
/* clear it: release all arguments and return to the freshly-constructed state */
void XConfig::Clear()
{
    for (int i = 0; i < n; i++)
        delete[] args[i];
    delete[] args;

    args  = NULL;
    n     = 0;
    nReal = 0;
}
/*
create a config from a command-line style argument list
>> myN - number of the input arguments
>> myArgs - the input arguments (each is deep-copied)
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
    CheckNTErrors(myN > 0, "No input parameters to XConfig!");

    /* drop whatever we kept before */
    for (int i = 0; i < n; i++)
        delete[] args[i];
    delete[] args;
    args = NULL;

    n = myN;

    /* allocate twice the needed slots so later Add() calls are cheap */
    nReal = n * 2;
    args = new char*[nReal];
    for (int i = 0; i < nReal; i++)
        args[i] = NULL;

    /* deep-copy every input string */
    for (int i = 0; i < n; i++) {
        CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
        args[i] = new char[strlen(myArgs[i]) + 1];
        strcpy(args[i], myArgs[i]);
    }
}
/*
add an argument. The name is stored with a leading '-' so that the list
matches the command-line form expected by the LoadParamXXX() parsers.
>> myArg - the argument (without the leading '-')
>> myValue - the value of the argument (NULL means a flag with no value)
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
    CheckNTErrors(myArg != NULL, "No argument!");

    /* grow the argument array if there is no room for two more entries */
    if (n + 2 > nReal) {
        nReal = MAX(n * 2 + 1, 128);
        char ** newArgs = new char*[nReal];

        /* BUGFIX: zero the WHOLE new array. The original cleared only the
           first n slots, leaving the tail [n, nReal) uninitialized. */
        memset(newArgs, 0, sizeof(char*) * nReal);
        memcpy(newArgs, args, sizeof(char*) * n);
        delete[] args;
        args = newArgs;
    }

    /* store the argument name with a leading '-' */
    args[n] = new char[strlen(myArg) + 2];
    args[n][0] = '-';
    strcpy(args[n] + 1, myArg);
    n++;

    /* store the value (if any) right after the name */
    if (myValue != NULL) {
        args[n] = new char[strlen(myValue) + 1];
        strcpy(args[n], myValue);
        n++;
    }
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument (without the leading '-')
>> myValue - the value of the argument; booleans are stored as "1" or "0"
*/
void XConfig::Add(const char * myArg, bool myValue)
{
    char text[2] = { myValue ? '1' : '0', '\0' };
    Add(myArg, text);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument (without the leading '-')
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
    /* delegate to the free-function parser over our stored argument list */
    LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean). The flag is true if present.
>> name - the name of the argument (without the leading '-')
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
    /* delegate to the free-function parser over our stored argument list */
    LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument (without the leading '-')
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
    /* delegate to the free-function parser over our stored argument list */
    LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument (without the leading '-')
>> p - where we place the loaded value (caller-supplied buffer, assumed
       large enough — TODO confirm against callers)
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
    /* delegate to the free-function parser over our stored argument list */
    LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument (without the leading '-')
>> defaultP - the default value (used only if no argument is hit in the list)
<< return - the loaded (or default) value
*/
int XConfig::GetInt(const char * name, int defaultP)
{
    int value;
    LoadInt(name, &value, defaultP);
    return value;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument (without the leading '-')
>> defaultP - the default value (used only if no argument is hit in the list)
<< return - the loaded (or default) value
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
    bool value;
    LoadBool(name, &value, defaultP);
    return value;
}
/*
get the value of an argument (in float)
>> name - the name of the argument (without the leading '-')
>> defaultP - the default value (used only if no argument is hit in the list)
<< return - the loaded (or default) value
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
    float value;
    LoadFloat(name, &value, defaultP);
    return value;
}
/* get item number, i.e., how many strings (names + values) are stored */
int XConfig::GetItemNum()
{
    return n;
}
/*
get the item with offset i
>> i - offset
<< return - the stored string, or NULL if i is out of range
*/
char * XConfig::GetItem(int i)
{
    /* out-of-range offsets yield NULL rather than an error */
    if (i < 0 || i >= n)
        return NULL;
    return args[i];
}
/*
initialize with another config model (deep copy)
>> myConfig - the configure model that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
    Clear();

    /* BUGFIX: the original called Add(GetItem(i), i), which prefixed every
       copied item with an extra '-' and appended the loop index as a bogus
       value, so the copy did not reproduce the source config. Copy the raw
       argument array instead (Create() deep-copies every string). */
    if (myConfig.GetItemNum() > 0)
        Create(myConfig.GetItemNum(), (const char **)myConfig.args);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for (without the leading '-')
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
    /* the argument appears on the command line with a leading '-' */
    char key[128];
    key[0] = '-';
    strcpy(key + 1, name);

    /* a hit needs the name AND a following value token */
    for (int i = 0; i + 1 < argc; i++) {
        if (strcmp(argv[i], key) == 0) {
            *p = atoi(argv[i + 1]);
            return;
        }
    }

    /* no hit: fall back to the default */
    *p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for (without the leading '-')
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
    /* the argument appears on the command line with a leading '-' */
    char key[128];
    key[0] = '-';
    strcpy(key + 1, name);

    /* a boolean flag is true as soon as it appears; it carries no value */
    for (int i = 0; i < argc; i++) {
        if (strcmp(argv[i], key) == 0) {
            *p = true;
            return;
        }
    }

    /* no hit: fall back to the default */
    *p = defaultP;
}
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for (without the leading '-')
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
    /* the argument appears on the command line with a leading '-' */
    char key[128];
    key[0] = '-';
    strcpy(key + 1, name);

    /* a hit needs the name AND a following value token */
    for (int i = 0; i + 1 < argc; i++) {
        if (strcmp(argv[i], key) == 0) {
            *p = (float)atof(argv[i + 1]);
            return;
        }
    }

    /* no hit: fall back to the default */
    *p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for (without the leading '-')
>> p - the pointer to the target buffer where we want to place the value
       (assumed large enough — TODO confirm against callers)
>> defaultP - the default value we use if no argument is found (may be NULL)
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
    /* the argument appears on the command line with a leading '-' */
    char key[128];
    key[0] = '-';
    strcpy(key + 1, name);

    /* a hit needs the name AND a following value token */
    for (int i = 0; i + 1 < argc; i++) {
        if (strcmp(argv[i], key) == 0) {
            strcpy(p, argv[i + 1]);
            return;
        }
    }

    /* BUGFIX: the original strcpy'd from defaultP unconditionally, crashing
       when the caller passed NULL as the default. Yield "" in that case. */
    if (defaultP != NULL)
        strcpy(p, defaultP);
    else
        p[0] = '\0';
}
/*
show the argument list on stderr as "name=value" (or "name=yes" for flags)
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
    fprintf(stderr, "args:\n");
    for (int i = 0; i < argc; i++) {
        /* BUGFIX: test argv[i][0] before argv[i][1] - the original read
           argv[i][1] first, an out-of-bounds read for an empty string "" */
        if (argv[i][0] != '-' || argv[i][1] == 0)
            continue;

        /* skip tokens like "-3" which are negative-number values, not names */
        if (argv[i][1] >= '1' && argv[i][1] <= '9')
            continue;

        if (i + 1 < argc && argv[i + 1][0] != '-')
            fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
        else
            fprintf(stderr, " %s=yes\n", argv[i]);
    }
    fprintf(stderr, "\n");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of items we rellocate for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config model */
void CreateFromMe(XConfig &myConfig);
};
#define MAX_PARAM_NUM 100

/* load a single argument of the given type from an argv-style list */
extern void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
extern void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
extern void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
extern void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);

/* show the argument list on stderr */
extern void ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -182,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID);
mem->Free();
#ifdef USE_CUDA
int devIDReset = devID;
Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) {
int devIDBackup = -1;
cudaGetDevice(&devIDBackup);
......@@ -195,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup);
}
#else
Clear();
#endif
}
......
......@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
......
......@@ -75,6 +75,9 @@ public:
/* de-constructor */
~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */
void Add(T&& item);
......@@ -84,6 +87,15 @@ public:
/* add an item into the list */
void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
......@@ -99,12 +111,30 @@ public:
/* get the item at position i */
T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to an long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */
int FindFirst(const T& item);
......@@ -135,7 +165,13 @@ public:
/* short */
T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
};
struct XTensor;
......
......@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
}
/*
......@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}
......@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
}
/*
......@@ -379,12 +385,18 @@ require a piece of memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize);
p = AllocStandard(myDevID, mySize);
else if(isStatic)
return AllocStatic(myDevID, mySize);
p = AllocStatic(myDevID, mySize);
else
return AllocDynamic(myDevID, mySize);
p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
}
/*
......@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock would
happans when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch;
......@@ -560,8 +577,10 @@ release a piece of memory
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
}
/*
......@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
}
bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
}
/*
......@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result;
}
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
/*
find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size
......@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
}
}
}
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
}
/* initialize it and set the global memory information */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h>
#include <stdlib.h>
......@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public:
/* constructor */
......@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize);
......
......@@ -215,7 +215,8 @@ void XQueue::DequeueJobs(XList * args)
int devID = *(int*)args->GetItem(1);
int devIDBackup = -1;
XDevice::SetDevice(devID, devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -236,7 +237,8 @@ void XQueue::DequeueJobs(XList * args)
}
XDevice::SetDevice(devIDBackup);
if(devID >= 0)
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -248,7 +250,11 @@ bool XQueue::GetJobBreak()
/* get the number of jobs */
int XQueue::GetJobNum()
{
return runningJobCount;
MUTEX_LOCK(jobQueueMutex);
int c = runningJobCount;
MUTEX_UNLOCK(jobQueueMutex);
return c;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -1985,6 +1985,19 @@ void XTensor::FlushToMem(XMem* targetMem)
}
}
/*
flush the data to the target device (with id)
>> myDevID - id of the target device
*/
void XTensor::FlushToDevice(int myDevID)
{
if (myDevID == devID)
return;
XMem * myMem = GMems.GetMem(myDevID);
FlushToMem(myMem);
}
/*
allocate the memory space of the tensor (in the global memory)
>> tensor - the tensor we intend to process
......
......@@ -457,6 +457,9 @@ public:
/* flush the data to the target device */
void FlushToMem(XMem * targetMem);
/* flush the data to the target device (with id) */
void FlushToDevice(int myDevID);
/* allocate the memory space of the tensor (in the global memory) */
static
void AllocateData(XTensor * tensor, XMem * myMem = NULL, bool useBuf = false);
......
......@@ -54,37 +54,6 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile XList*);
/*
......
......@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{
if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost;
return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice;
return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost;
return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else
return cudaMemcpyDeviceToDevice;
return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
}
#endif
......@@ -485,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */
void XSleep(int sleepTime)
{
if (sleepTime <= 0)
return;
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
......@@ -553,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
hi = (char*)data + (long)realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
indexhi = index != NULL ? (int*)index + (long)stride * (num - 1) : NULL;
recurse:
......@@ -565,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else {
mid = lo + (size/2) * realStride;
indexmid = indexlo + (size/2) * stride;
mid = lo + (long)(size/2) * realStride;
indexmid = indexlo + (long)(size/2) * stride;
/* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0)
......@@ -834,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0;
if (sepLen == 0) {
char* item = new char[inputLen + 1];
char* item = new char[(long)inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
//int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
//int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
}
\ No newline at end of file
}
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> index - index of the output dimension (over the vocabulary)
>> onehot - one-hot representation of the index
>> size - vocabuary size (last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
......
......@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
int dim = 0;
int order = source->order;
//int dim = 0;
//int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along with a dimension.
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
......
......@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
......@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
int blockSize = 1;
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
XTensor * smallsItem0 = smalls->GetItem(0);
int unitNum = smallsItem0->unitNum;
//int unitNum = smallsItem0->unitNum;
int unitSize = smallsItem0->unitSize;
int itemSize = unitNum * unitSize;
for (int i = 0; i < smallsItem0->order; i++) {
if (i >= dim)
......@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
for (int i = 0; i < tensor->order; i++) {
for (int i = 0; i < order; i++) {
if (i < dim) {
if (t.GetDim(i) != tensor->GetDim(i))
return false;
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......
......@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(max);
DelTensorBuf(sum);
if (mem != NULL)
mem->UnlockBuf();
if (x->devID >= 0) {
delete blockx;
......
......@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * max = NULL;
XTensor * sum = NULL;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(sum);
DelTensorBuf(max);
if (mem != NULL)
mem->UnlockBuf();
delete[] dimSize;
}
......
......@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL)
output->mem->LockBuf();
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
......@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL)
output->mem->UnlockBuf();
return loss;
}
......
......@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
......@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
}
/*
......@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
......@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
return loss;
}
......
......@@ -215,12 +215,7 @@ bool TestConvertDataType3()
{0.5F, -4.0F},
{0.0F, 6.0F} };
DTYPE data2[2][3] = { {1.0F, 2.0F, 3.0F},
{0.0F, 4.0F, 5.0F} };
DTYPE answer[3][3] = { {1.0F, -6.0F, -7.0F},
{0.5F, -15.0F, -18.5F},
{0.0F, 24.0F, 30.0F} };
/* CPU test */
bool cpuTest = true;
......@@ -241,6 +236,14 @@ bool TestConvertDataType3()
cpuTest = _CheckData(a, data1, unitNum1, 1e-4F);
#ifdef USE_CUDA
DTYPE data2[2][3] = { { 1.0F, 2.0F, 3.0F },
{ 0.0F, 4.0F, 5.0F } };
DTYPE answer[3][3] = { { 1.0F, -6.0F, -7.0F },
{ 0.5F, -15.0F, -18.5F },
{ 0.0F, 24.0F, 30.0F } };
/* GPU test */
bool gpuTest = true;
......
......@@ -67,7 +67,6 @@ bool TestGather1()
DTYPE answer[2][3] = { {0.0F, -1.0F, 2.0F},
{1.0F, 2.0F, 4.0F} };
int dim = 0;
int indexSize = 2;
int srcIndex[2] = {0, 2};
......
......@@ -422,7 +422,7 @@ bool TestSetData6()
for (int i = 0; i < order; i++)
unitNum *= dimSize[i];
DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
//DTYPE answer[5] = {5.2F, 3.2F, 1.2F, -0.8F, -2.8F};
/* CPU test */
bool cpuTest = true;
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * We test XTrain here. It is simple: we design a toy task in which the
 * model predicts an integer E (0-100) from four input integers
 * A, B, C and D (0-100). We generate a number of samples with different
 * values of A, B, C and D. The gold standard is
 *
 *          E = (int)(sqrt(A * B) + abs(C - D))/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
*/
#include "TTrain.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/function/FHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor * tmpTT = NULL;
/* generate the training data file */
void GeneateTTrainData(const char * fileName)
{
    /* the output format: first line = sample count, then one sample
       "A B C D E" per line, where E is the gold-standard answer */
    FILE * file = fopen(fileName, "wb");
    CheckNTErrors(file, "Cannot open the file");

    XPRINT(1, stderr, "[INFO] Generating data ... ");

    const int sampleNum = MAX_SAMPLE_NUM_IN_TTRAIN;
    const int range = MAX_INT_IN_TTRAIN;

    fprintf(file, "%d\n", sampleNum);

    /* fixed seed so that the generated corpus is reproducible */
    srand(1);

    /* draw a pseudo-random integer in [0, range] */
    auto draw = [range]() {
        return (int)(((float)rand() / RAND_MAX) * range);
    };

    for (int i = 0; i < sampleNum; i++) {
        int A = draw();
        int B = draw();
        int C = draw();
        int D = draw();
        int E = (int)((sqrt(A * B) + abs(C - D)) / 2);

        fprintf(file, "%d %d %d %d %d\n", A, B, C, D, E);
    }

    XPRINT2(1, stderr, "%d samples in \"%s\" [DONE]\n", sampleNum, fileName);

    fclose(file);
}
/* run the test */
void TestTrain()
{
    /* build the synthetic corpus first */
    GeneateTTrainData("ttrain.txt");

    /* training hyper-parameters */
    XConfig config;
    //config.Add("dev", -1);
    config.Add("lrate", 0.1F);
    config.Add("nstep", 100000);
    config.Add("nepoch", 5);
    config.Add("jobdev0", 0);
    //config.Add("jobdev4", -1);

    const int modelDevID = config.GetInt("jobdev0", -1);

    /* the data source */
    TTDataLoader loader;
    loader.SetFileName("ttrain.txt");
    loader.SetBatchSize(config.GetInt("batchsize", TT_BATCH_SIZE));

    /* the toy model (a two-layer feed-forward classifier) */
    TTModel model;
    model.Init(config, modelDevID);
    tmpTT = model.params[0].param;

    /* the optimizer and the training loop */
    XOptimizer optimizer;
    optimizer.Init(config);

    XTrainer trainer;
    trainer.Run(&config, &loader, &model, &optimizer);
}
/*****************************
* data loader
******************************/
/* constructor */
TTDataLoader::TTDataLoader() :
    file(NULL), batchSize(TT_BATCH_SIZE)
{
    /* the file-name buffer is owned by this object and released in
       the destructor */
    fileName = new char[MAX_FILE_NAME_LENGTH];
}
/* de-constructor */
TTDataLoader::~TTDataLoader()
{
    /* NOTE(review): the file handle is NOT closed here - callers are
       expected to call End() after loading; otherwise the handle leaks */
    delete[] fileName;
}
/* set file name */
void TTDataLoader::SetFileName(const char * myFileName)
{
strcpy(fileName, myFileName);
}
/* set batch size */
/* 
set the size of a mini-batch
>> myBatchSize - number of samples returned by each call of GetBatchSimple
*/
void TTDataLoader::SetBatchSize(int myBatchSize)
{
    batchSize = myBatchSize;
}
/* start the process */
/* 
start the loading process: open the training file and move the read
position past the header line
<< return - true on success (CheckNTErrors aborts on failure)
*/
bool TTDataLoader::Start()
{
    file = fopen(fileName, "rb");
    CheckNTErrors(file != NULL, "Cannot open the file");

    /* skip the first line - it holds the sample count, not a sample */
    char * line = new char[MAX_SAMPLE_LINE_LENGTH];
    char * ret = fgets(line, MAX_SAMPLE_LINE_LENGTH, file);
    delete[] line;

    /* fix: the fgets result was ignored before, so an empty file would
       pass silently here and fail obscurely later in GetBatchSimple */
    CheckNTErrors(ret != NULL, "Cannot read the header line of the file");

    return true;
}
/* end the process */
/* 
end the loading process: close the training file
<< return - true
*/
bool TTDataLoader::End()
{
    /* fix: guard against a NULL handle (End() called twice, or before
       Start()) and reset the pointer so a later close cannot hit a
       stale handle */
    if (file != NULL) {
        fclose(file);
        file = NULL;
    }

    return true;
}
/*
get a batch of samples
>> inputs - inputs of the model
>> golds - gold standards
*/
/* 
get a batch of samples
>> inputs - inputs of the model; item 0 receives a (count, MAX_SAMPLE_SIZE)
            integer tensor of the features A, B, C, D
>> golds - gold standards; item 0 receives a (count, 1) integer tensor of E
<< return - true if at least one sample was read, false at end of file
*/
bool TTDataLoader::GetBatchSimple(XList * inputs, XList * golds)
{
    CheckNTErrors(file != NULL, "No input file specificed!");
    CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong argument!");
    CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong argument!");

    XTensor * input = (XTensor*)inputs->GetItem(0);
    XTensor * gold = (XTensor*)golds->GetItem(0);

    int count = 0;
    int sampleSize = MAX_SAMPLE_SIZE;
    char * line = new char[MAX_SAMPLE_LINE_LENGTH];
    int * inputBatch = new int[batchSize * sampleSize];
    int * goldBatch = new int[batchSize];
    int A, B, C, D, E;

    /* fix: check the batch-full condition BEFORE reading. The old code
       read a line first and then broke out when the batch was full, so
       one sample per full batch was consumed from the stream and
       silently dropped */
    while (count < batchSize && fgets(line, MAX_SAMPLE_LINE_LENGTH, file)) {
        if (sscanf(line, "%d %d %d %d %d", &A, &B, &C, &D, &E) < sampleSize + 1) {
            ShowNTErrors("Wrong format in the training file!");
        }

        inputBatch[count * sampleSize] = A;
        inputBatch[count * sampleSize + 1] = B;
        inputBatch[count * sampleSize + 2] = C;
        inputBatch[count * sampleSize + 3] = D;
        goldBatch[count] = E;

        count++;
    }

    if (count > 0) {
        /* use sampleSize rather than a magic "4" so the tensor shape
           stays consistent with the batch layout written above */
        InitTensor2D(input, count, sampleSize, X_INT);
        InitTensor2D(gold, count, 1, X_INT);

        input->SetData(inputBatch, count * sampleSize);
        gold->SetData(goldBatch, count);
    }

    delete[] line;
    delete[] inputBatch;
    delete[] goldBatch;

    return count > 0;
}
/*****************************
* the neural model
******************************/
/* constructor: no device chosen yet, all sizes start at zero */
TTModel::TTModel()
{
    devID = -1;
    vSize = eSize = hSize = 0;
}
/* de-constructor: members (tensors, config) clean themselves up */
TTModel::~TTModel()
{
}
/* config it */
void TTModel::SetConfig(XConfig &myConfig)
{
    /* presumably copies myConfig's entries into our own config so later
       changes to myConfig do not affect this model - TODO confirm the
       direction of XConfig::CreateFromMe */
    config.CreateFromMe(myConfig);
}
/*
initialize the model: read sizes from the configuration, create the
three parameter matrices, randomize them and register them as
trainable parameters
>> myConfig - configuration
>> myDevID - device id
*/
void TTModel::Init(XConfig &myConfig, int myDevID)
{
    /* order matters: Clear() wipes the old config before SetConfig()
       installs the new one */
    Clear();
    SetConfig(myConfig);
    devID = myDevID;
    /* vocabulary covers the integers 0..MAX_INT_IN_TTRAIN */
    vSize = MAX_INT_IN_TTRAIN + 1;
    eSize = config.GetInt("esize", TT_EMBEDDING_SIZE);
    hSize = config.GetInt("hsize", TT_HIDDEN_SIZE);
    InitTensor2D(&embeddingW, vSize, eSize, X_FLOAT, devID);
    /* the hidden layer consumes the concatenation of all
       MAX_SAMPLE_SIZE input embeddings (see Forward) */
    InitTensor2D(&hiddenW, MAX_SAMPLE_SIZE * eSize, hSize, X_FLOAT, devID);
    InitTensor2D(&outputW, hSize, vSize, X_FLOAT, devID);
    embeddingW.SetName("embeddingw");
    hiddenW.SetName("hiddenw");
    outputW.SetName("outputw");
    /* small uniform random initialization */
    embeddingW.SetDataRand(-0.1F, 0.1F);
    hiddenW.SetDataRand(-0.1F, 0.1F);
    outputW.SetDataRand(-0.1F, 0.1F);
    /* register the matrices as trainable parameters */
    AddParam(&embeddingW);
    AddParam(&hiddenW);
    AddParam(&outputW);
}
/*
create the model
>> devID - device id
>> input - as it is
>> output - as it is
*/
void TTModel::Forward(int devID, XTensor * input, XTensor * output)
{
XTensor embedding;
XTensor embeddingCat;
XTensor hidden;
/* [e_0, e_1, e_2] = w_e * input(one-hot) */
embedding = Gather(embeddingW, *input);
/* e = merge(e_0, e_1, e_2) */
embeddingCat = Merge(embedding, embedding.order - 1, embedding.order - 2);
/* h = hardtanh(e * w_h) */
hidden = HardTanH(MMul(embeddingCat, hiddenW));
/* output = Softmax(h * w_o) */
*output = Softmax(MMul(hidden, outputW), -1);
}
/* clear the model (only the configuration; parameter tensors are
   left untouched and are re-initialized by Init) */
void TTModel::Clear()
{
    config.Clear();
}
/*
clone the model for a worker
>> devID - device id of the clone
<< return - a new TTModel with the same configuration and a copy of the
            current parameter values
*/
XModel * TTModel::Clone(int devID)
{
    TTModel * model = new TTModel();

    /* Init() already calls SetConfig() internally, so the extra
       SetConfig() call the original made here was redundant */
    model->Init(config, devID);

    /* overwrite the random initialization with our parameter values */
    CopyValues(embeddingW, model->embeddingW);
    CopyValues(hiddenW, model->hiddenW);
    CopyValues(outputW, model->outputW);

    return model;
}
/*
run the neural network: one forward + backward pass on a batch
>> inputs - inputs of the model
>> outputs - outputs of the model
>> golds - gold standards
>> losses - losses of the output with respect to the gold standards
<< return - always true
*/
bool TTModel::RunSimple(XList * inputs, XList * outputs, XList * golds, XList* losses)
{
    //fprintf(stderr, "run simple 0\n");
    CheckNTErrors(inputs != NULL && inputs->count >= 1, "Wrong arguments!");
    CheckNTErrors(outputs != NULL && outputs->count >= 1, "Wrong arguments!");
    CheckNTErrors(golds != NULL && golds->count >= 1, "Wrong arguments!");
    CheckNTErrors(losses != NULL && losses->count >= 1, "Wrong arguments!");
    XTensor * input = (XTensor*)inputs->GetItem(0);
    XTensor * output = (XTensor*)outputs->GetItem(0);
    XTensor * gold = (XTensor*)golds->GetItem(0);
    XTensor * loss = (XTensor*)losses->GetItem(0);
    XTensor goldOneHot;
    /* place all input data on the correct device */
    input->FlushToDevice(devID);
    output->FlushToDevice(devID);
    gold->FlushToDevice(devID);
    XNet net;
    /* create the neural network and run it */
    Forward(devID, input, output);
    /* gold standard in one-hot representation */
    goldOneHot = IndexToOnehot(*gold, vSize, 0.0F);
    /* collapse the last two dimensions of the one-hot tensor into one so
       it matches the output's shape.
       NOTE(review): this assumes gold has shape [count, 1] as produced by
       TTDataLoader::GetBatchSimple - confirm for other loaders */
    int * dims = new int[goldOneHot.order];
    for (int i = 0; i < goldOneHot.order - 2; i++)
        dims[i] = goldOneHot.GetDim(i);
    dims[goldOneHot.order - 2] = goldOneHot.GetDim(goldOneHot.order - 1);
    goldOneHot.Reshape(goldOneHot.order - 1, dims);
    /* loss */
    *loss = CrossEntropy(*output, goldOneHot);
    /* back-propagation */
    net.Backward(*loss);
    delete[] dims;
    //fprintf(stderr, "run simple 1\n");
    return true;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We test XTrain here. It is simple, we design a simple task in that we
* make the model to predict an integer D (0-100) from three input integers
* A, B and C (0-100). We generate a number of samples with different values
* of A, B and C. The gold standard is
*
* D = (int)(sqrt(A * B) + C)/2
*
* Our model is a two-layer feed-forward neural network. It can be treated
* as a classifier rather than a regression model.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-03-03
* The express train was updated this year. It just takes me two hours and
* a half from Shenyang to Beijing.
*/
#ifndef __TTRAIN_H__
#define __TTRAIN_H__
#include <stdio.h>
#include <stdlib.h>
#include "XTrainer.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_SAMPLE_NUM_IN_TTRAIN 200000  /* number of samples generated for training */
#define MAX_INT_IN_TTRAIN 100            /* values are integers in [0, MAX_INT_IN_TTRAIN] */
#define MAX_SAMPLE_LINE_LENGTH 128       /* maximum length of one line in the data file */
#define MAX_SAMPLE_SIZE 4                /* input integers per sample (A, B, C, D) */
#define TT_BATCH_SIZE 256                /* default mini-batch size */
#define TT_EMBEDDING_SIZE 128            /* default embedding size */
#define TT_HIDDEN_SIZE 512               /* default hidden-layer size */
/* first parameter tensor of the server model (assigned in TestTrain),
   exposed for inspection */
extern XTensor * tmpTT;
/* generate the training data file */
void GeneateTTrainData(const char * fileName);
/* run the test */
extern
void TestTrain();
/* data loader: reads "A B C D E" lines from a text file and packs
   them into input/gold tensors, one batch at a time */
class TTDataLoader : public DataDistributeBase
{
protected:
    /* name of the training-data file */
    char * fileName;
    /* handle of the open file (NULL before Start/after End) */
    FILE * file;
    /* number of samples per batch */
    int batchSize;
public:
    /* constructor */
    TTDataLoader();
    /* de-constructor */
    ~TTDataLoader();
    /* set file name */
    void SetFileName(const char * myFileName);
    /* set batch size */
    void SetBatchSize(int myBatchSize);
    /* start the process (open the file, skip the header line) */
    bool Start();
    /* end the process (close the file) */
    bool End();
    /* get a batch of samples; returns false at end of file */
    bool GetBatchSimple(XList * inputs, XList * golds);
};
/* the model: a two-layer feed-forward network
   (embedding -> hardtanh hidden layer -> softmax output) */
class TTModel : public XModel
{
protected:
    /* device id */
    int devID;
    /* configuration (private copy, see SetConfig) */
    XConfig config;
    /* embedding matrix of the input, shape [vSize, eSize] */
    XTensor embeddingW;
    /* parameter matrix of the hidden layer,
       shape [MAX_SAMPLE_SIZE * eSize, hSize] */
    XTensor hiddenW;
    /* parameter matrix of the output layer, shape [hSize, vSize] */
    XTensor outputW;
    /* vocabulary size (MAX_INT_IN_TTRAIN + 1) */
    int vSize;
    /* embedding size */
    int eSize;
    /* hidden layer size */
    int hSize;
public:
    /* constructor */
    TTModel();
    /* de-constructor */
    ~TTModel();
    /* config it */
    void SetConfig(XConfig &myConfig);
    /* initialize the parameters */
    void Init(XConfig &myConfig, int myDevID);
    /* create the model (forward pass) */
    void Forward(int devID, XTensor * input, XTensor * output);
    /* clear the model */
    void Clear();
    /* clone the model */
    XModel * Clone(int devID);
    /* run the neural network (forward + backward on one batch) */
    bool RunSimple(XList * inputs, XList * outputs, XList * golds, XList * losses);
};
/* */
}
#endif
\ No newline at end of file
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
*/
#include "XBaseTemplate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*******************************
* data loader template
*******************************/
/* constructor: create the mutex that serializes batch loading */
DataDistributeBase::DataDistributeBase()
{
    MUTEX_INIT(loadMutex);
}
/* de-constructor: release the batch-loading mutex */
DataDistributeBase::~DataDistributeBase()
{
    MUTEX_DELE(loadMutex);
}
/* start the job (e.g., open the file); subclasses must override this -
   the base version always aborts */
bool DataDistributeBase::Start()
{
    ShowNTErrors("DataDistributeBase::Start must be overloaded!");
    return true;
}
/* end the job (e.g., close the file); subclasses must override this -
   the base version always aborts */
bool DataDistributeBase::End()
{
    ShowNTErrors("DataDistributeBase::End must be overloaded!");
    return true;
}
/*
get a batch of samples (default implementation - does nothing)
>> inputs - inputs of the model
>> golds - gold standards
<< return - false, signalling that the subclass did not override it
            (GetBatch() reports an error in that case)
*/
bool DataDistributeBase::GetBatchSimple(XList * inputs, XList * golds)
{
    return false;
}
/* fetch one batch; args = { inputList, goldList } */
bool DataDistributeBase::GetBatch(XList * args)
{
    CheckNTErrors(args->count >= 2, "More input arguments are required!");

    XList * inputList = (XList*)args->GetItem(0);
    XList * goldList = (XList*)args->GetItem(1);

    /* delegate to the (hopefully overloaded) simple loader */
    if (!GetBatchSimple(inputList, goldList)) {
        ShowNTErrors("You must be overload one of these: DataDistributeBase::GetBatchSimple ... !");
        return false;
    }

    return true;
}
/* thread-safe wrapper around GetBatch(): the whole fetch runs
   under the loading mutex */
bool DataDistributeBase::GetBatchSafe(XList * args)
{
    MUTEX_LOCK(loadMutex);
    bool result = GetBatch(args);
    MUTEX_UNLOCK(loadMutex);
    return result;
}
}
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2016-2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* We define various template classes here. They will be overloaded and used
* in applications.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-25
* The meeting at 3:00pm today was canceled. More time for coding.
*/
#ifndef __XNETTEMPLATE_H__
#define __XNETTEMPLATE_H__
#include "../tensor/XTensor.h"
#include "../tensor/XThread.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
data distributor template. It distributes batches of data to workers.
The use of data distributor follows:
Start() -> GetBatch() -> ... -> GetBatch() -> End()
In addition, GetBatch() should be thread-safe, and thus could be
called by different threads simultaneously.
*/
class DataDistributeBase
{
protected:
    /* mutex that serializes batch loading across threads */
    MUTEX_HANDLE loadMutex;
public:
    /* constructor */
    DataDistributeBase();
    /* de-constructor. Declared virtual because this class is a polymorphic
       base (Start/End/GetBatchSimple are virtual and loaders such as
       TTDataLoader derive from it): without a virtual destructor,
       deleting a derived loader through a DataDistributeBase pointer is
       undefined behavior and would skip the derived clean-up. */
    virtual ~DataDistributeBase();
    /* start the job (e.g., open the file).
       NOTE THAT before calling Start() one should initialize
       the distributor if necessary */
    virtual
    bool Start();
    /* end the job (e.g., close the file) */
    virtual
    bool End();
    /* get a batch of samples */
    virtual
    bool GetBatchSimple(XList * inputs, XList * golds);
public:
    /* get a batch of samples */
    bool GetBatch(XList * args);
    /* get a batch of samples (for multi-threading) */
    bool GetBatchSafe(XList * args);
};
}
#endif // __XNETTEMPLATE_H__
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论