Commit a79523f9 by liyinqiao

Merge with the xiaotong branch and add mutexes to guard memory pool operations.

parent 7d4bc44a
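The change applies one recurring pattern: every temporary tensor drawn from a memory pool's buffer is now wrapped in a LockBuf()/UnlockBuf() pair, so that concurrent threads cannot be handed the same buffer region twice. A minimal sketch of the discipline, assuming a pool-bound tensor "input" (the surrounding gradient code is elided):

    if (input->mem != NULL)
        input->mem->LockBuf();          /* take the buffer mutex */
    XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
    /* ... compute into tmp and accumulate into input->grad ... */
    DelTensorBuf(tmp);                  /* return the buffer memory */
    if (input->mem != NULL)
        input->mem->UnlockBuf();        /* release the buffer mutex */

The critical section spans AllocBuf() through ReleaseBuf(), which is why the locking is exposed to callers via LockBuf()/UnlockBuf() rather than being done inside those two functions (their internal MUTEX_LOCK/MUTEX_UNLOCK calls are left commented out below). The NULL check matters: tensors not bound to a pool fall back to plain allocation, which needs no buffer lock.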
......@@ -27,6 +27,7 @@
#include "./tensor/test/Test.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/NMT.h"
#include "./train/TTrain.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......@@ -38,8 +39,14 @@ using namespace nmt;
int main( int argc, const char ** argv )
{
if(argc > 1 && !strcmp(argv[1], "-test"))
XConfig config;
config.Create(argc - 1, argv + 1);
verboseLevel = config.GetInt("verbose", 1);
if (argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if (argc > 1 && !strcmp(argv[1], "-testtrain"))
TestTrain();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else if(argc > 1 && !strcmp(argv[1], "-t2t"))
......@@ -47,7 +54,8 @@ int main( int argc, const char ** argv )
else{
fprintf(stderr, "Thanks for using NiuTensor! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, " Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-testtrain\" for test of the trainer!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
}
......
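For context, main() now routes flags through XConfig, so verbosity can be set on the command line; a hypothetical invocation (the binary name is illustrative):

    ./NiuTensor -test -verbose 2

config.Create(argc - 1, argv + 1) skips the program name, and GetInt("verbose", 1) falls back to 1 when the flag is absent.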
......@@ -93,6 +93,7 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for an activation function */
......
......@@ -89,6 +89,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/* indicates whether the node is for a loss computation */
......
......@@ -105,12 +105,19 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -138,12 +145,19 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -167,15 +181,20 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -193,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -205,9 +226,12 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -246,13 +270,16 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
dims[0] = -abs(dims[0]);
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
dims[whereToMerge - leadDim] *= abs(dims[0]);
int * dimsNode = dims + 1;
dimsNode[0] = -abs(dimsNode[0]);
XTensor gradNodeSmall(node->order - leadDim, dimsNode,
node->dataType, node->denseRatio,
node->devID, node->mem);
......@@ -296,6 +323,7 @@ void XShapeGrad::GradMerge(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -379,6 +407,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -407,6 +436,7 @@ void XShapeGrad::GradReshape(XTensor * node, bool isEfficient)
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -442,16 +472,21 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -528,14 +563,21 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
node->visitMark = NODE_DOING;
node->isGradFinished = true;
}
/*
......@@ -566,14 +608,19 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
/*
......@@ -603,15 +650,20 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
\ No newline at end of file
......@@ -101,6 +101,7 @@ void XNet::Backward(TensorList &roots)
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
node->isGradFinished = false;
}
/* back-propagation from output to input */
......@@ -162,6 +163,7 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
}
else{
node->visitMark = NODE_FINISHED;
node->isGradFinished = true;
}
}
......
......@@ -21,8 +21,8 @@
#include "Decoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -21,8 +21,8 @@
#include "Encoder.h"
#include "Utility.h"
#include "module/LayerNorm.h"
#include "module/CommonModules.h"
#include "submodel/LayerNorm.h"
#include "submodel/CommonModules.h"
#include "../../tensor/core/CHeader.h"
namespace nmt
......
......@@ -23,10 +23,10 @@
#define __ENCODER_H__
#include "Utility.h"
#include "module/FNN.h"
#include "module/Attention.h"
#include "module/Embedding.h"
#include "module/LayerNorm.h"
#include "submodel/FNN.h"
#include "submodel/Attention.h"
#include "submodel/Embedding.h"
#include "submodel/LayerNorm.h"
#include "../../network/XNet.h"
using namespace nts;
......
......@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
......@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
......@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......@@ -490,7 +494,7 @@ void Model::Read(FILE* file)
TensorList params;
GetParams(params);
LOG("params count: %lu", params.Size());
LOG("params count: %lu", (unsigned long)params.Size());
int size = 0;
for (int i = 0; i < params.Size(); i++) {
size += params[i]->unitNum;
......
......@@ -24,10 +24,10 @@
#include "Encoder.h"
#include "Decoder.h"
#include "module/FNN.h"
#include "module/Output.h"
#include "submodel/FNN.h"
#include "submodel/Output.h"
#include "Utility.h"
#include "module/Attention.h"
#include "submodel/Attention.h"
namespace nmt
{
......
......@@ -28,6 +28,7 @@
#include "Utility.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XConfig.h"
using namespace nts;
using namespace std;
......@@ -91,9 +92,9 @@ Config::Config(int argc, const char** argv)
LoadParamInt(argsNum, args, "sbatch", &sBatchSize, 8);
isTraining = (strcmp(trainFN, "") == 0) ? false : true;
LoadParamBool(argsNum, args, "mt", &isMT, true);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1);
LoadParamFloat(argsNum, args, "dropout", &dropout, 0.3F);
LoadParamFloat(argsNum, args, "fnndrop", &fnnDropout, 0.1F);
LoadParamFloat(argsNum, args, "attdrop", &attDropout, 0.1F);
LoadParamFloat(argc, args, "lrate", &lrate, 0.0015F);
LoadParamFloat(argc, args, "lrbias", &lrbias, 0);
......@@ -106,7 +107,7 @@ Config::Config(int argc, const char** argv)
LoadParamFloat(argc, args, "adambeta2", &adamBeta2, 0.98F);
LoadParamFloat(argc, args, "adamdelta", &adamDelta, 1e-9F);
LoadParamBool(argc, args, "shuffled", &isShuffled, true);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1);
LoadParamFloat(argc, args, "labelsmoothing", &labelSmoothingP, 0.1F);
LoadParamInt(argc, args, "nstepcheckpoint", &nStepCheckpoint, -1);
LoadParamBool(argc, args, "epochcheckpoint", &useEpochCheckpoint, true);
LoadParamInt(argc, args, "updatestep", &updateStep, 1);
......@@ -124,8 +125,8 @@ Config::Config(int argc, const char** argv)
LoadParamString(argsNum, args, "output", outputFN, "");
LoadParamInt(argsNum, args, "beamsize", &beamSize, 1);
LoadParamBool(argsNum, args, "fp16", &useFP16, false);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2);
LoadParamFloat(argsNum, args, "lenalpha", &lenAlpha, 0.6F);
LoadParamFloat(argsNum, args, "maxlenalpha", &maxLenAlpha, 1.2F);
for (int i = 0; i < argc; i++)
delete[] args[i];
......@@ -157,90 +158,6 @@ int Config::LoadFromFile(const char* configFN, char** args) {
return argsNum;
}
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
#define MAX_WORD_NUM 120
/*
......@@ -275,7 +192,9 @@ IntList SplitInt(const string& s, const string& delimiter)
IntList values;
auto indices = SplitToPos(s, delimiter);
for (int i = 0; i < indices.Size(); i++) {
values.Add(strtol(s.data() + indices[i], nullptr, 10));
/* FIXME: this line is problematic. Why do we need an IntList to keep an int64? */
values.Add((int)strtol(s.data() + indices[i], nullptr, 10));
}
return values;
}
......@@ -291,4 +210,4 @@ FloatList SplitFloat(const string& s, const string& delimiter)
return values;
}
}
\ No newline at end of file
}
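For reference, these helpers turn delimiter-separated fields into typed lists; a hedged sketch of a call site (the values shown are illustrative):

    IntList ids = SplitInt("3,14,159", ",");      /* -> [3, 14, 159] */
    FloatList ws = SplitFloat("0.1,0.9", ",");    /* -> [0.1, 0.9]   */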
......@@ -33,17 +33,6 @@ using namespace nts;
namespace nmt
{
#define MAX_PARAM_NUM 100
/* load arguments */
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void ShowParams(int argc, char** argv);
/* split string */
IntList SplitInt(const string& s, const string& delimiter);
FloatList SplitFloat(const string& s, const string& delimiter);
......
......@@ -226,7 +226,6 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
XTensor qheads;
XTensor vheads;
const int batchSize = q.GetDim(0);
const int lenQ = q.GetDim(1);
const int lenKV = k.GetDim(1);
......@@ -255,7 +254,7 @@ XTensor Attention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v,
relativeKey = ConvertDataType(relativeKey, X_FLOAT);
}
float scaling = sqrt(d / nhead);
float scaling = (float)sqrt(d / nhead);
qheads = ScaleAndShift(qheads, 1.0F / scaling);
dot = RPDotProduct(qheads, kheads, relativeKey, true);
......@@ -402,4 +401,4 @@ void Cache::Reorder(XTensor& reorder)
value = AutoGather(value, reorder);
}
}
}
\ No newline at end of file
}
......@@ -48,8 +48,6 @@ void GLU::InitModel(Config& config)
{
devID = config.devID;
float minmax = 0;
inSize = config.modelSize;
outSize = config.modelSize;
......@@ -84,4 +82,4 @@ XTensor GLU::Make(XTensor& input)
return t1 * Sigmoid(t2);
}
}
\ No newline at end of file
}
......@@ -92,10 +92,10 @@ generate the weight sum vector of all previous layer output in the history as th
XTensor LayerHistory::Pop()
{
/* the number of layer output in the history */
size_t size = history.Size();
int size = (int)history.Size();
TensorList historyList;
for (size_t i = 0; i < size; i++)
for (int i = 0; i < size; i++)
historyList.Add(history[i]);
/* we need to stack the tensors along the first dim */
......
......@@ -134,13 +134,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List info;
size_t srcTokenNum = 0;
size_t tgtTokenNum = 0;
int realBatchSize = 1;
size_t realBatchSize = 1;
if (!isTraining)
realBatchSize = minSentBatch;
/* get the maximum source sentence length in a mini-batch */
size_t maxSrcLen = buffer[curIdx]->srcSent.Size();
size_t maxSrcLen = buffer[(int)curIdx]->srcSent.Size();
/* max batch size */
const int MAX_BATCH_SIZE = 512;
......@@ -150,9 +150,9 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
while ((realBatchSize < (buffer.Size() - curIdx))
&& (realBatchSize * maxSrcLen < batchSize)
&& (realBatchSize < MAX_BATCH_SIZE)
&& (realBatchSize * buffer[curIdx + realBatchSize]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[curIdx + realBatchSize]->srcSent.Size())
maxSrcLen = buffer[curIdx + realBatchSize]->srcSent.Size();
&& (realBatchSize * buffer[(int)(curIdx + realBatchSize)]->srcSent.Size() < batchSize)) {
if (maxSrcLen < buffer[(int)(curIdx + realBatchSize)]->srcSent.Size())
maxSrcLen = buffer[(int)(curIdx + realBatchSize)]->srcSent.Size();
realBatchSize++;
}
}
......@@ -165,14 +165,14 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
CheckNTErrors(realBatchSize > 0, "Invalid batch size");
/* get the maximum target sentence length in a mini-batch */
size_t maxTgtLen = buffer[curIdx]->tgtSent.Size();
size_t maxTgtLen = buffer[(int)curIdx]->tgtSent.Size();
for (size_t i = 0; i < realBatchSize; i++) {
if (maxTgtLen < buffer[curIdx + i]->tgtSent.Size())
maxTgtLen = buffer[curIdx + i]->tgtSent.Size();
if (maxTgtLen < buffer[(int)(curIdx + i)]->tgtSent.Size())
maxTgtLen = buffer[(int)(curIdx + i)]->tgtSent.Size();
}
for (size_t i = 0; i < realBatchSize; i++) {
if (maxSrcLen < buffer[curIdx + i]->srcSent.Size())
maxSrcLen = buffer[curIdx + i]->srcSent.Size();
if (maxSrcLen < buffer[(int)(curIdx + i)]->srcSent.Size())
maxSrcLen = buffer[(int)(curIdx + i)]->srcSent.Size();
}
CheckNTErrors(maxSrcLen != 0, "Invalid source length for batching");
......@@ -204,19 +204,19 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
*/
for (int i = 0; i < realBatchSize; ++i) {
srcTokenNum += buffer[curIdx + i]->srcSent.Size();
tgtTokenNum += buffer[curIdx + i]->tgtSent.Size();
srcTokenNum += buffer[(int)(curIdx + i)]->srcSent.Size();
tgtTokenNum += buffer[(int)(curIdx + i)]->tgtSent.Size();
curSrc = maxSrcLen * i;
for (int j = 0; j < buffer[curIdx + i]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[curIdx + i]->srcSent[j];
for (int j = 0; j < buffer[(int)(curIdx + i)]->srcSent.Size(); j++) {
batchEncValues[curSrc++] = buffer[(int)(curIdx + i)]->srcSent[j];
}
curTgt = maxTgtLen * i;
for (int j = 0; j < buffer[curIdx + i]->tgtSent.Size(); j++) {
for (int j = 0; j < buffer[(int)(curIdx + i)]->tgtSent.Size(); j++) {
if (j > 0)
labelVaues[curTgt - 1] = buffer[curIdx + i]->tgtSent[j];
batchDecValues[curTgt++] = buffer[curIdx + i]->tgtSent[j];
labelVaues[curTgt - 1] = buffer[(int)(curIdx + i)]->tgtSent[j];
batchDecValues[curTgt++] = buffer[(int)(curIdx + i)]->tgtSent[j];
}
labelVaues[curTgt - 1] = EOS;
while (curSrc < maxSrcLen * (i + 1))
......@@ -226,11 +226,13 @@ UInt64List TrainDataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
}
InitTensor2D(batchEnc, realBatchSize, maxSrcLen, X_INT, devID);
InitTensor2D(paddingEnc, realBatchSize, maxSrcLen, X_FLOAT, devID);
InitTensor2D(batchDec, realBatchSize, maxTgtLen, X_INT, devID);
InitTensor2D(paddingDec, realBatchSize, maxTgtLen, X_FLOAT, devID);
InitTensor2D(label, realBatchSize, maxTgtLen, X_INT, devID);
int rbs = (int)realBatchSize;
int msl = (int)maxSrcLen;
int mtl = (int)maxTgtLen;
InitTensor2D(batchEnc, rbs, msl, X_INT, devID);
InitTensor2D(paddingEnc, rbs, msl, X_FLOAT, devID);
InitTensor2D(batchDec, rbs, mtl, X_INT, devID);
InitTensor2D(paddingDec, rbs, mtl, X_FLOAT, devID);
InitTensor2D(label, rbs, mtl, X_INT, devID);
curIdx += realBatchSize;
......@@ -304,14 +306,14 @@ void TrainDataSet::BuildBucket()
size_t sentNum = 1;
/* get the maximum source sentence length in a bucket */
size_t maxSrcLen = buffer[idx]->srcSent.Size();
size_t maxSrcLen = buffer[(int)idx]->srcSent.Size();
/* bucketing for sentences */
while ((sentNum < (buffer.Size() - idx))
&& (sentNum * maxSrcLen < bucketSize)
&& (sentNum * buffer[curIdx + sentNum]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[idx + sentNum]->srcSent.Size())
maxSrcLen = buffer[idx + sentNum]->srcSent.Size();
&& (sentNum * buffer[(int)(curIdx + sentNum)]->srcSent.Size() < bucketSize)) {
if (maxSrcLen < buffer[(int)(idx + sentNum)]->srcSent.Size())
maxSrcLen = buffer[(int)(idx + sentNum)]->srcSent.Size();
sentNum++;
}
......@@ -324,7 +326,7 @@ void TrainDataSet::BuildBucket()
/* shuffle items in a bucket */
for (size_t i = 0; i < sentNum; i++) {
buffer[idx + i]->bucketKey = randomKey;
buffer[(int)(idx + i)]->bucketKey = randomKey;
}
idx += sentNum;
......@@ -335,13 +337,13 @@ void TrainDataSet::BuildBucket()
idx = 0;
while (idx < buffer.Size()) {
size_t sentNum = 0;
int bucketKey = buffer[idx + sentNum]->bucketKey;
int bucketKey = buffer[(int)(idx + sentNum)]->bucketKey;
while (sentNum < (buffer.Size() - idx)
&& buffer[idx + sentNum]->bucketKey == bucketKey) {
buffer[idx + sentNum]->key = buffer[idx + sentNum]->srcSent.Size();
&& buffer[(int)(idx + sentNum)]->bucketKey == bucketKey) {
buffer[(int)(idx + sentNum)]->key = (int)buffer[(int)(idx + sentNum)]->srcSent.Size();
sentNum++;
}
SortInBucket(idx, idx + sentNum);
SortInBucket((int)idx, (int)(idx + sentNum));
idx += sentNum;
}
}
......
......@@ -98,6 +98,21 @@ public:
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID);
/* load the samples into the buffer (a list) */
bool LoadBatchToBuf(XList * buf);
/* load the samples into tensors from the buffer */
static
bool LoadBatch(XList * buf,
XTensor* batchEnc, XTensor* paddingEnc,
XTensor* batchDec, XTensor* paddingDec, XTensor* label,
size_t minSentBatch, size_t batchSize, int devID,
int &wc, int &sc);
/* release the samples in a buffer */
static
void ClearSamples(XList * buf);
/* initialization function */
void Init(const char* dataFile, int bucketSize, bool training);
......
......@@ -163,8 +163,8 @@ void Trainer::Train(const char* fn, const char* validFN,
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, wBatchSize, devID);
wc = info[0];
ws = info[1];
wc = (int)info[0];
ws = (int)info[1];
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
/* output probabilities */
......@@ -206,7 +206,7 @@ void Trainer::Train(const char* fn, const char* validFN,
if (gradStep == updateStep) {
float warmupEndLR = lrate;
float warmupInitLR = 1e-7;
float warmupInitLR = 1e-7F;
float lrStep = (warmupEndLR - warmupInitLR) / nwarmup;
float decayFactor = warmupEndLR * pow(float(nwarmup), 0.5F);
......@@ -320,8 +320,8 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
UInt64List info = batchLoader.LoadBatch(&batchEnc, &paddingEnc, &batchDec, &paddingDec, &label,
sBatchSize, 0, model->devID);
wc = info[0];
ws = info[1];
wc = (int)info[0];
ws = (int)info[1];
CheckNTErrors(batchEnc.order == 2, "Wrong tensor order of the sequence batch");
/* make the network */
......@@ -334,7 +334,7 @@ void Trainer::Validate(const char* fn, const char* ofn, Model* model)
}
int bSize = output.GetDim(0);
int length = output.GetDim(1);
//int length = output.GetDim(1);
labelOnehot = IndexToOnehot(label, vSizeTgt, 0);
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
......@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......@@ -479,4 +481,4 @@ void Trainer::PrepareModel(Model* model)
adamBeta2T = 1.0F;
}
}
\ No newline at end of file
}
......@@ -70,10 +70,10 @@ void DataSet::LoadDataToBuffer()
size_t maxLen = indices.Size() > MAX_WORD_NUM ? MAX_WORD_NUM : indices.Size();
for (size_t i = 0; i < maxLen; i++) {
auto offset = (i != (indices.Size() - 1)) ?
indices[i + 1] - indices[i] - tokenDelimiter.size()
: line.size() - indices[i];
string word = line.substr(indices[i], offset);
size_t offset = (i != (indices.Size() - 1)) ?
(size_t)indices[(int)i + 1] - (size_t)indices[(int)i] - tokenDelimiter.size()
: line.size() - (size_t)indices[(int)i];
string word = line.substr((size_t)indices[(int)i], offset);
if (srcVocab.word2id.find(word) == srcVocab.word2id.end())
values.Add(UNK);
else
......@@ -110,12 +110,12 @@ load a mini-batch to the device (for translating)
<< indices of the sentences
*/
UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t minSentBatch, size_t batchSize, int devID)
int minSentBatch, int batchSize, int devID)
{
size_t realBatchSize = minSentBatch;
int realBatchSize = minSentBatch;
/* get the maximum sentence length in a mini-batch */
size_t maxLen = inputBuffer[bufferUsed]->values.Size();
int maxLen = (int)inputBuffer[(int)bufferUsed]->values.Size();
/* dynamic batching for sentences */
//while ((realBatchSize < (inputBuffer.Size() - bufferUsed))
......@@ -125,7 +125,7 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
/* real batch size */
if ((inputBuffer.Size() - bufferUsed) < realBatchSize) {
realBatchSize = inputBuffer.Size() - bufferUsed;
realBatchSize = (int)(inputBuffer.Size() - bufferUsed);
}
CheckNTErrors(maxLen != 0, "invalid length");
......@@ -144,15 +144,15 @@ UInt64List DataSet::LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
UInt64List infos;
size_t totalLength = 0;
for (int i = 0; i < realBatchSize; ++i) {
infos.Add(inputBuffer[bufferUsed + i]->id);
totalLength += inputBuffer[bufferUsed + i]->values.Size();
for (size_t i = 0; i < (size_t)realBatchSize; ++i) {
infos.Add(inputBuffer[(int)(bufferUsed + i)]->id);
totalLength += inputBuffer[(int)(bufferUsed + i)]->values.Size();
curSrc = maxLen * i;
for (int j = 0; j < inputBuffer[bufferUsed + i]->values.Size(); j++)
batchValues[curSrc++] = inputBuffer[bufferUsed + i]->values[j];
for (size_t j = 0; j < inputBuffer[(int)(bufferUsed + i)]->values.Size(); j++)
batchValues[(int)(curSrc++)] = (int)inputBuffer[(int)(bufferUsed + i)]->values[(int)j];
while (curSrc < maxLen * (i + 1))
paddingValues[curSrc++] = 0;
paddingValues[(int)(curSrc++)] = 0;
}
infos.Add(totalLength);
......
......@@ -85,7 +85,7 @@ public:
/* generate a mini-batch */
UInt64List LoadBatch(XTensor* batchEnc, XTensor* paddingEnc,
size_t sBatch, size_t wBatch, int devID);
int sBatch, int wBatch, int devID);
/* initialization function */
void Init(const char* dataFile, const char* srcVocabFN, const char* tgtVocabFN);
......
......@@ -42,7 +42,7 @@ float LengthPenalizer::GNMT(float length, float alpha)
base = (length + 5.0F) / (1.0F + 5.0F);
lp = pow(base, alpha);
lp = (float)pow(base, alpha);
return lp;
}
......
......@@ -22,7 +22,7 @@
#include <iostream>
#include "Predictor.h"
#include "../module/NNUtil.h"
#include "../submodel/NNUtil.h"
using namespace nts;
......
......@@ -322,7 +322,7 @@ void BeamSearch::Generate(StateBundle* prev, StateBundle* beam)
/* keep the most promising candidates in the beam */
TopK(score, scoreTopK, index, -1, beamSize, true);
float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
//float lp = LengthPenalizer::GNMT(beam->nstep, alpha);
CopyValues(index, indexCPU);
CopyValues(index, preID);
......@@ -493,8 +493,8 @@ void BeamSearch::Collect(StateBundle* beam)
/* check if this is the first end symbol. It is false
if there have been end symbols in previously generated words. */
bool isCompleted = state.isCompleted &&
(state.last == NULL || !state.last->isCompleted);
//bool isCompleted = state.isCompleted &&
// (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if ((state.isEnd || state.isCompleted)) {
......@@ -557,7 +557,6 @@ void BeamSearch::Dump(IntList* output, XTensor* score)
}
}
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
......@@ -873,4 +872,4 @@ void GreedySearch::Search(Model* model, XTensor& input,
delete[] finishedFlags;
}
}
\ No newline at end of file
}
......@@ -155,7 +155,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
batchLoader.outputBuffer.Add(emptyRes);
}
double startDump = GetClockSec();
//double startDump = GetClockSec();
/* reorder the result */
batchLoader.SortOutput();
......@@ -163,7 +163,7 @@ void Translator::Translate(const char* ifn, const char* sfn,
/* print the result to a file */
batchLoader.DumpRes(ofn);
double elapsed = GetClockSec() - startDump;
//double elapsed = GetClockSec() - startDump;
LOG("translation completed (word=%d, sent=%zu)",
wordCountTotal, batchLoader.inputBuffer.Size() + batchLoader.emptyLines.Size());
......@@ -196,4 +196,4 @@ void Translator::Dump(FILE* file, XTensor* output)
}
}
}
\ No newline at end of file
}
......@@ -34,14 +34,14 @@ void Vocab::Load(const string& src)
/* get the vocab size and the start id */
f >> vsz >> sid;
startID = stol(sid);
vocabSize = stol(vsz);
startID = (int)stol(sid);
vocabSize = (int)stol(vsz);
string word, id;
for (int i = 0; i < vocabSize - startID; i++) {
f >> word >> id;
word2id[word] = stol(id);
id2word[stol(id)] = word;
word2id[word] = (int)stol(id);
id2word[(int)stol(id)] = word;
}
f.close();
......@@ -75,4 +75,4 @@ void Vocab::CopyFrom(const Vocab& v)
id2word.insert(i2w);
}
}
\ No newline at end of file
}
......@@ -847,6 +847,7 @@ XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE
XTensor * tensor = NewTensor1D(unitNum, myDataType, myDevID, isEnableGrad);
tensor->Range(lower, upper, step);
return tensor;
}
......
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class keeps a batch of parameters.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
*/
#include "XConfig.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* constructor */
XConfig::XConfig()
{
n = 0;
args = NULL;
nReal = 0;
}
/* de-constructor */
XConfig::~XConfig()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
}
/* clear it */
void XConfig::Clear()
{
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
n = 0;
args = NULL;
nReal = 0;
}
/*
create a config
>> myN - number of the input arguments
>> myArgs - the input arguments
*/
void XConfig::Create(const int myN, const char ** myArgs)
{
CheckNTErrors(myN > 0, "No input parameters to XConfig!");
for (int i = 0; i < n; i++) {
delete[] args[i];
}
delete[] args;
args = NULL;
n = myN;
nReal = n * 2;
args = new char*[nReal];
for (int i = 0; i < nReal; i++) {
args[i] = NULL;
}
for (int i = 0; i < n; i++) {
CheckNTErrors(myArgs[i] != NULL, "Illegal parameter input!");
args[i] = new char[strlen(myArgs[i]) + 1];
strcpy(args[i], myArgs[i]);
}
}
/*
add an argument
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, const char * myValue)
{
CheckNTErrors(myArg != NULL, "No argument!");
if (n + 2 > nReal) {
nReal = MAX(n * 2 + 1, 128);
char ** newArgs = new char*[nReal];
memset(newArgs, 0, sizeof(char*) * n);
memcpy(newArgs, args, sizeof(char*) * n);
delete[] args;
args = newArgs;
}
args[n] = new char[strlen(myArg) + 2];
args[n][0] = '-';
strcpy(args[n] + 1, myArg);
n++;
if (myValue != NULL) {
args[n] = new char[strlen(myValue) + 1];
strcpy(args[n], myValue);
n++;
}
}
/*
add an argument (in integer)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, int myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%d", myValue);
Add(myArg, value);
}
/*
add an argument (in bool)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, bool myValue)
{
char value[2];
if (myValue)
value[0] = '1';
else
value[0] = '0';
value[1] = 0;
Add(myArg, value);
}
/*
add an argument (in float)
>> myArg - the argument
>> myValue - the value of the argument
*/
void XConfig::Add(const char * myArg, float myValue)
{
char value[MAX_WORD_LENGTH_IN_CONFIG];
sprintf(value, "%f", myValue);
Add(myArg, value);
}
/*
load the value of an argument (in integer)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadInt(const char * name, int * p, int defaultP)
{
LoadParamInt(n, args, name, p, defaultP);
}
/*
load the value of an argument (in boolean)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadBool(const char * name, bool * p, bool defaultP)
{
LoadParamBool(n, args, name, p, defaultP);
}
/*
load the value of an argument (in float)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadFloat(const char * name, float * p, float defaultP)
{
LoadParamFloat(n, args, name, p, defaultP);
}
/*
load the value of an argument (in char string)
>> name - the name of the argument
>> p - where we place the loaded value
>> defaultP - the default value (used only if no argument is hit in the list)
*/
void XConfig::LoadString(const char * name, char * p, const char* defaultP)
{
LoadParamString(n, args, name, p, defaultP);
}
/*
get the value of an argument (in integer)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
int XConfig::GetInt(const char * name, int defaultP)
{
int r;
LoadInt(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in bool)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
bool XConfig::GetBool(const char * name, bool defaultP)
{
bool r;
LoadBool(name, &r, defaultP);
return r;
}
/*
get the value of an argument (in float)
>> name - the name of the argument
>> defaultP - the default value (used only if no argument is hit in the list)
*/
float XConfig::GetFloat(const char * name, float defaultP)
{
float r;
LoadFloat(name, &r, defaultP);
return r;
}
/* get item number */
int XConfig::GetItemNum()
{
return n;
}
/*
get the item with offset i
>> i - offset
*/
char * XConfig::GetItem(int i)
{
if (i < n && i >= 0)
return args[i];
else
return NULL;
}
/*
initialize with another config model
>> myConfig - the config that we want to copy
*/
void XConfig::CreateFromMe(XConfig & myConfig)
{
Clear();
for (int i = 0; i < myConfig.GetItemNum(); i++)
Add(myConfig.GetItem(i), i);
}
/*
load the value of an argument (in integer)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*(int*)p = atoi(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in boolean)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname)) {
*(bool*)p = true;
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in float)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
*p = (float)atof(argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
*p = defaultP;
}
/*
load the value of an argument (in char string)
>> argc - number of arguments
>> argv - arguments
>> name - the argument we search for
>> p - the pointer to the target variable where we want to place the value
>> defaultP - the default value we use if no argument is found
*/
void LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP)
{
char vname[128];
vname[0] = '-';
strcpy(vname + 1, name);
bool hit = false;
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], vname) && i + 1 < argc) {
strcpy(p, argv[i + 1]);
hit = true;
break;
}
}
if (!hit)
strcpy(p, defaultP);
}
/*
show the argument list
>> argc - number of arguments
>> argv - arguments
*/
void ShowParams(int argc, char** argv)
{
fprintf(stderr, "args:\n");
for (int i = 0; i < argc; i++) {
if (argv[i][1] == 0)
continue;
if (argv[i][0] == '-' && (argv[i][1] < '1' || argv[i][1] > '9')) {
if (i + 1 < argc && argv[i + 1][0] != '-')
fprintf(stderr, " %s=%s\n", argv[i], argv[i + 1]);
else
fprintf(stderr, " %s=yes\n", argv[i]);
}
}
fprintf(stderr, "\n");
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
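Taken together, XConfig stores flat "-name value" pairs and the LoadParam* helpers search them linearly. A hypothetical usage sketch (the argument names below are made up for illustration):

    #include "XConfig.h"
    using namespace nts;

    int Run(int argc, const char ** argv)
    {
        XConfig config;
        config.Create(argc - 1, argv + 1);    /* copy the raw arguments */
        config.Add("beamsize", 4);            /* stored as "-beamsize" "4" */

        int beam = config.GetInt("beamsize", 1);
        bool verbose = config.GetBool("verbose", false);
        float lr = config.GetFloat("lrate", 0.001F);
        return beam;
    }

One caveat visible in LoadParamBool(): booleans are flag-style, reported true whenever "-name" appears at all, so "-verbose 0" still reads as true.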
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2021
* Natural Language Processing Lab, Northeastern University
* and
* NiuTrans Research
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* this class defines a parameter keeper.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2021-02-28
* A new semester begins today.
*/
#ifndef __XCONFIG_H__
#define __XCONFIG_H__
#include "XGlobal.h"
#include "XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MAX_WORD_LENGTH_IN_CONFIG 256
/* the parameter keeper */
class XConfig
{
private:
/* number of arguments */
int n;
/* argument list (in char*) */
char ** args;
/* number of items we reallocate for these arguments */
int nReal;
public:
/* constructor */
XConfig();
/* de-constructor */
~XConfig();
/* clear it */
void Clear();
/* create a config */
void Create(const int myN, const char ** myArgs);
/* add an argument */
void Add(const char * myArg, const char * myValue);
/* add an argument (in integer) */
void Add(const char * myArg, int myValue);
/* add an argument (in bool) */
void Add(const char * myArg, bool myValue);
/* add an argument (in float) */
void Add(const char * myArg, float myValue);
/* load the value of an argument to a variable (in integer) */
void LoadInt(const char * name, int * p, int defaultP);
/* load the value of an argument to a variable (in boolean) */
void LoadBool(const char * name, bool * p, bool defaultP);
/* load the value of an argument to a variable (in float) */
void LoadFloat(const char * name, float * p, float defaultP);
/* load the value of an argument to a variable (in char string) */
void LoadString(const char * name, char * p, const char* defaultP);
/* get the value of an argument (in integer) */
int GetInt(const char * name, int defaultP);
/* get the value of an argument (in boolean) */
bool GetBool(const char * name, bool defaultP);
/* get the value of an argument (in float) */
float GetFloat(const char * name, float defaultP);
/* get item number */
int GetItemNum();
/* get the item with offset i */
char * GetItem(int i);
/* initialize with another config model */
void CreateFromMe(XConfig &myConfig);
};
#define MAX_PARAM_NUM 100
/* load arguments */
void extern LoadParamInt(int argc, char** argv, const char* name, int* p, int defaultP);
void extern LoadParamBool(int argc, char** argv, const char* name, bool* p, bool defaultP);
void extern LoadParamFloat(int argc, char** argv, const char* name, float* p, float defaultP);
void extern LoadParamString(int argc, char** argv, const char* name, char* p, const char* defaultP);
/* show arguments */
void extern ShowParams(int argc, char** argv);
} // namespace nts(NiuTrans.Tensor)
#endif
\ No newline at end of file
......@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
......@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif
}
......@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
......@@ -189,10 +182,11 @@ void XDevice::Reset()
XMem * mem = GMems.GetMem(devID);
mem->Free();
#ifdef USE_CUDA
int devIDReset = devID;
Clear();
#ifdef USE_CUDA
if (devIDReset >= 0) {
int devIDBackup = -1;
cudaGetDevice(&devIDBackup);
......@@ -202,6 +196,8 @@ void XDevice::Reset()
cudaSetDevice(devIDBackup);
}
#else
Clear();
#endif
}
......@@ -227,17 +223,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle;
}
/* get the stream of cuda */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA
/* switch to a device */
......@@ -286,6 +271,28 @@ int XDevice::GetGPUDevice()
#endif
}
/*
switch to a device (CPU or GPU)
>> devID - device id
*/
void XDevice::SetDevice(int devID)
{
if(devID >= 0)
SetGPUDevice(devID);
}
/*
switch to a device (CPU or GPU) with a backup of the device id
>> devID - device id
>> backupDevID - backup of the device id
*/
void XDevice::SetDevice(int devID, int &backupDevID)
{
backupDevID = GetGPUDevice();
if (devID >= 0)
SetGPUDevice(devID);
}
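This pair mirrors how DequeueJobs() uses it later in this commit: remember the current device, switch for the work, then restore. A sketch (workerDevID is an illustrative name):

    int devIDBackup = -1;
    XDevice::SetDevice(workerDevID, devIDBackup);   /* save and switch */
    /* ... run the job on workerDevID ... */
    XDevice::SetDevice(devIDBackup);                /* restore */

Since SetDevice() ignores negative ids, the restore is a no-op for CPU-only runs.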
/* reset cuda flag for more efficient cuda execution. It should be called after "SetGPUDevice" when
no GPU context has been established. */
void XDevice::SetFastFlags()
......@@ -312,13 +319,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */
XDevManager::XDevManager()
{
......@@ -391,14 +391,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif
/*
......@@ -620,16 +612,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
......@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
......@@ -140,12 +136,9 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
/* get the stream of cuda */
cudaStream_t * GetCudaStream();
#endif
/* switch to a device */
/* switch to a GPU device */
static
void SetGPUDevice(int devID);
......@@ -153,10 +146,18 @@ public:
static
void SetGPUDeviceFast(int devID);
/* switch to a get current dev */
/* get current dev */
static
int GetGPUDevice();
/* switch to a device (CPU or GPU) */
static
void SetDevice(int devID);
/* switch to a device (CPU or GPU) with a backup of the device id */
static
void SetDevice(int devID, int &backupDevID);
/* reset cuda flag for more efficient cuda execution */
static
void SetFastFlags();
......@@ -164,9 +165,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
};
/*
......@@ -206,9 +204,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID);
/* get the stream of cuda */
cudaStream_t * GetCudaStream(const int devID);
#endif
/* get grid and block sizes that max potential */
......@@ -228,10 +223,6 @@ public:
/* get the device information in string */
char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
};
/* managing the devices */
......
......@@ -132,6 +132,36 @@ extern int TRAINING_SAMPLE_BUF_SIZE;
extern int CONST_MINUSONE;
extern bool CONST_TRUE;
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
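These macros give mutexes and condition variables one portable spelling: critical sections and events on Windows, pthreads elsewhere. A small usage sketch under the same macros (the counter is illustrative, not part of this commit):

    MUTEX_HANDLE countMutex;
    int count = 0;

    void InitCounter() { MUTEX_INIT(countMutex); }
    void Bump()
    {
        MUTEX_LOCK(countMutex);
        count++;
        MUTEX_UNLOCK(countMutex);
    }
    void FreeCounter() { MUTEX_DELE(countMutex); }

Note that COND_WAIT takes a condition and a mutex, but the Windows version ignores the mutex argument, so portable callers must still pass both; COND_BROADCAST is only defined on the pthread side here.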
//#define USE_CUDA_RESURSION 1
#define NIUTRANSNNDEBUG
......
......@@ -26,8 +26,6 @@
#ifndef __XLINK_H__
#define __XLINK_H__
#include "XGlobal.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
......
......@@ -75,6 +75,9 @@ public:
/* de-constructor */
~TensorListBase();
/* reallocate */
void Reallocate(int itemNum);
/* add an item into the list */
void Add(T&& item);
......@@ -84,6 +87,15 @@ public:
/* add an item into the list */
void Add(const T& item);
/* add an item (as an integer) into the list */
void AddInt(const int item);
/* add an item (as a float) into the list */
void AddFloat(const float item);
/* add an item (as a long long) into the list */
void AddLLong(const long long item);
/* add a number of items into the list */
void Add(const T* inputItems, int inputItemCount);
......@@ -99,12 +111,30 @@ public:
/* get the item at position i */
T& GetItem(int i) const;
/* get the item at position i and force it to an integer */
int GetItemInt(int i) const;
/* get the item at position i and force it to a float number */
float GetItemFloat(int i) const;
/* get the item at position i and force it to a long long number */
long long GetItemLLong(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* set the item (as an integer) at position i */
void SetItemInt(int i, const int item);
/* set the item (as a float) at position i */
void SetItemFloat(int i, const float item);
/* set the item (as a long long) at position i */
void SetItemLLong(int i, const long long item);
/* find the position of the first matched item */
int FindFirst(const T& item);
......@@ -135,7 +165,13 @@ public:
/* short */
T& operator[] (int i) const { return GetItem(i); };
T& Get(int i) const { return GetItem(i); };
int GetInt(int i) const { return GetItemInt(i); };
float GetFloat(int i) const { return GetItemFloat(i); };
long long GetLLong(int i) const { return GetItemLLong(i); };
void Set(int i, T item) { SetItem(i, item); };
void SetInt(int i, int item) { SetItemInt(i, item); };
void SetFloat(int i, float item) { SetItemFloat(i, item); };
void SetLLong(int i, long long item) { SetItemLLong(i, item); };
};
struct XTensor;
......
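The typed accessors above let one generic list carry scalars next to pointers, which is what the reworked XQueue/XPRunner code later in this commit relies on. A small sketch, assuming the usual XList typedef over void* items (someObject stands in for any object):

    XList args(4);
    args.Add(&someObject);   /* a pointer item                          */
    args.AddInt(128);        /* an int packed into a pointer-sized slot */
    args.AddFloat(0.5F);

    void * p = args.Get(0);
    int n = args.GetInt(1);
    float r = args.GetFloat(2);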
......@@ -54,6 +54,8 @@ XMem::XMem()
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
}
/*
......@@ -77,6 +79,8 @@ XMem::XMem(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int myBlockNum,
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
MUTEX_INIT(allocMutex);
MUTEX_INIT(bufMutex);
Initialize(myDevID, myMode, myBlockSize, myBlockNum, myBufSize);
}
......@@ -99,6 +103,8 @@ XMem::~XMem()
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
MUTEX_DELE(allocMutex);
MUTEX_DELE(bufMutex);
}
/*
......@@ -379,12 +385,18 @@ require a piece of memory
*/
void * XMem::Alloc(int myDevID, MTYPE mySize)
{
void * p = NULL;
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
return AllocStandard(myDevID, mySize);
p = AllocStandard(myDevID, mySize);
else if(isStatic)
return AllocStatic(myDevID, mySize);
p = AllocStatic(myDevID, mySize);
else
return AllocDynamic(myDevID, mySize);
p = AllocDynamic(myDevID, mySize);
MUTEX_UNLOCK(allocMutex);
return p;
}
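Note why the early returns above became assignments: with the new MUTEX_LOCK(allocMutex) at the top of Alloc(), a direct

    return AllocStandard(myDevID, mySize);   /* would exit with the lock held */

would leave allocMutex locked forever and deadlock the next caller. Funneling every branch through the single unlock-then-return keeps the lock balanced on all paths.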
/*
......@@ -521,6 +533,11 @@ void * XMem::AllocBuf(int myDevID, MTYPE mySize, int pitch)
{
MTYPE backOffset = 0;
/* NOTE THAT this is tricky because we lock the buffer
but DO NOT unlock it in this function. The unlock
happens when we call ReleaseBuf() */
//MUTEX_LOCK(bufMutex);
if(pitch > 1){
MTYPE address = (MTYPE)((char*)buf + bufUsed);
int offset = address % pitch;
......@@ -560,8 +577,10 @@ release a piece of memory
*/
void XMem::Release(int myDevID, void * p, MTYPE size)
{
MUTEX_LOCK(allocMutex);
if(mode == FREE_ON_THE_FLY)
ReleaseStandard(myDevID, p, size);
MUTEX_UNLOCK(allocMutex);
}
/*
......@@ -583,6 +602,9 @@ void XMem::ReleaseBuf(int myDevID, MTYPE mySize, int pitch)
}
bufUsed -= (mySize + backOffset);
/* NOTE THAT this is a response to the lock in AllocBuf() */
//MUTEX_UNLOCK(bufMutex);
}
/*
......@@ -825,6 +847,18 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
return result;
}
/* lock the buffer mutex */
void XMem::LockBuf()
{
MUTEX_LOCK(bufMutex);
}
/* unlock the buffer mutex */
void XMem::UnlockBuf()
{
MUTEX_UNLOCK(bufMutex);
}
/*
find the highest set bit (or most significant set bit) in an integer-64
>> mySize - required size
......@@ -1511,12 +1545,12 @@ void XMem::ShowMemUsage(FILE * file)
}
MTYPE bufTotal = bufSize;
MTYPE bufUsed = bufUsed;
MTYPE bufUsedTotal = bufUsed;
fprintf(file, "block mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)blockTotal/MILLION, (DTYPE)blockUsed/MILLION, (DTYPE)blockUsed/blockTotal);
fprintf(file, "buffer mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsed / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
(DTYPE)bufTotal / 1024 / 1024, (DTYPE)bufUsedTotal / 1024 / 1024, (DTYPE)bufUsed / bufTotal);
}
......@@ -1560,7 +1594,7 @@ MTYPE XMemManager::GetAvailableMemory()
MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys;
freeMem = (unsigned long)memoryStatus.ullAvailPhys;
}
#else
long pages = sysconf(_SC_AVPHYS_PAGES);
......@@ -1604,6 +1638,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
}
}
}
else {
ShowNTErrors("No enough memory for buffer allocation!");
}
}
/* initialize it and set the global memory information */
......
......@@ -24,6 +24,7 @@
#ifndef __XMEM_H__
#define __XMEM_H__
#include "XGlobal.h"
#include <stdio.h>
#include <stdlib.h>
......@@ -249,6 +250,13 @@ public:
/* indicates whether we merge free memory pieces on the fly */
bool mergeFreeOTF;
private:
/* a mutex for memory allocation and release */
MUTEX_HANDLE allocMutex;
/* a mutex for buffer memory allocation and release */
MUTEX_HANDLE bufMutex;
public:
/* constructor */
......@@ -337,6 +345,12 @@ public:
/* allocate a piece of memory as "malloc" */
void * AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex = false);
/* lock the buffer mutex */
void LockBuf();
/* unlock the buffer mutex */
void UnlockBuf();
/* find the highest set bit (or most significant set bit) in an integer-64 */
int GetMSB(MTYPE mySize);
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,13 +195,12 @@ void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepT
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
thread->argv = args;
thread->function = function;
thread->SetFunc(function, args);
MUTEX_LOCK(thread->workingMutex);
thread->working = 1;
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new TensorList(1);
args = new XList(1);
}
/* de-constructor */
......@@ -67,12 +67,9 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new TensorList(1);
jobDequeuerArgs = new XList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex);
......@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete[] queue;
delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue)
// StopJobConsumer();
......@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "Unfinished jobs remain");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "Unfinished jobs remain");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "Unfinished jobs remain");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
......@@ -189,12 +170,11 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
// warning: this may cause unknown error
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
/* warning: this may cause unknown errors */
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (devids + jobDevID) : &cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
jobDequeuer.SetFunc((TFunction)DequeueJobs, jobDequeuerArgs);
jobDequeuer.Start();
jobDequeuer.LetItGo();
......@@ -213,7 +193,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
void XQueue::EnqueueJob(void * job, XList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -227,17 +207,16 @@ void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(TensorList * args)
void XQueue::DequeueJobs(XList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
XQueue * q = (XQueue*)args->GetItem(0);
int devID = *(int*)args->GetItem(1);
int devIDBackup = XDevice::GetGPUDevice();
int devIDBackup = -1;
if(devID >= 0)
XDevice::SetGPUDevice(devID);
XDevice::SetDevice(devID, devIDBackup);
while(1){
JobQueueNode * node = (JobQueueNode*)q->Dequeue();
......@@ -259,7 +238,7 @@ void XQueue::DequeueJobs(TensorList * args)
}
if(devID >= 0)
XDevice::SetGPUDevice(devIDBackup);
XDevice::SetDevice(devIDBackup);
}
/* get the break flag */
......@@ -268,31 +247,14 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak;
}
/* get job stream */
XStream * XQueue::GetJobStream(int n)
/* get the number of jobs */
int XQueue::GetJobNum()
{
if(n == 0)
return jobStream;
else if(n == 1)
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
MUTEX_LOCK(jobQueueMutex);
int c = runningJobCount;
MUTEX_UNLOCK(jobQueueMutex);
/* make job streams */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
return c;
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
......@@ -52,7 +51,7 @@ public:
void * job;
/* arguments of the job */
TensorList * args;
XList * args;
public:
/* constructor */
......@@ -102,7 +101,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
TensorList * jobDequeuerArgs;
XList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -110,11 +109,6 @@ private:
/* running job count */
int runningJobCount;
/* job streams (we think that three streams is enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public:
/* constructor */
XQueue(int mySize = MAX_QUEUE_SIZE);
......@@ -135,26 +129,23 @@ public:
void WaitForEmptyJobQueue();
/* run the job consumer */
void RunJobConsumer(int jobDevID = 0);
void RunJobConsumer(int jobDevID = -1);
/* stop the job consumer */
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, TensorList * jobArgs);
void EnqueueJob(void * job, XList * jobArgs);
/* job item consumer */
static
void DequeueJobs(TensorList * args);
void DequeueJobs(XList * args);
/* get the break flag */
bool GetJobBreak();
/* get job stream */
XStream * GetJobStream(int n = 0);
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
/* get the number of jobs */
int GetJobNum();
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
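A usage sketch of the reworked queue (q, MyJob, and x are illustrative; the consumer now defaults to the CPU since jobDevID = -1):

XQueue q;
q.RunJobConsumer(-1);                      /* run jobs on the CPU */

XList * jobArgs = new XList(1);
jobArgs->Add(&x);                          /* x is some argument object */
q.EnqueueJob((void*)(TFunction)MyJob, jobArgs);

while (q.GetJobNum() > 0)                  /* poll the new job counter */
    XSleep(10);
q.StopJobConsumer();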
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
 * This is for streaming (on GPUs), i.e., running jobs in different streams for
 * GPU async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* destructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
if(cudaStreamQuery(stream) == cudaSuccess)
return true;
else
return false;
#else
return true;
#endif
}
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
cudaGetDevice(&devIDBackup);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream is finished */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make an event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
 * This is for streaming (on GPUs), i.e., running jobs in different streams for
 * GPU async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronizing different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* destructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have been finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream is finished */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make an event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -89,10 +89,6 @@ XTensor::XTensor()
Init();
id = MakeTensorID();
isDefaultDType = true;
isInGlobalMem = false;
isInit = false;
isTmp = false;
reserved = 0;
}
......@@ -277,6 +273,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
isGradFinished = false;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
......@@ -772,10 +769,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
void XTensor::SetZeroAll()
{
if(data == NULL)
return;
......@@ -788,12 +784,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaMemset(data, 0, size);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -807,13 +798,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -845,11 +831,11 @@ void XTensor::Rand(int rNum, int cNum)
}
/* generate data items with a range by start, end and the step
>> start - the begin of the array
>> end - the end of the array (not included self)
>> step - the step of two items
>> start - the beginning of the array
>> end - the end of the array (it does not includes itself)
>> step - the step we take along the array
*/
void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
void XTensor::Range(int lower, int upper, int step)
{
_SetDataRange(this, lower, upper, step);
}
......
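For example, under the new integer-only Range (a sketch; InitTensor1DV2 stands in for whichever initializer the caller uses):

XTensor v;
InitTensor1DV2(&v, 5, X_INT, -1);   /* a 1-D int tensor with 5 items on the CPU */
v.Range(0, 10, 2);                  /* v = {0, 2, 4, 6, 8}; 10 itself is excluded */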
......@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
......@@ -157,6 +156,11 @@ public:
/* mark for traversing the graph */
unsigned int visitMark;
/* indicates whether the gradient of the tensor has been computed (in the backward process)
Note that the indicator could be modified by XNet (in back propagation) and be accessed
in XTrainer (and related classes). */
bool isGradFinished;
/* gradient (for back-propagation) */
XTensor * grad;
......@@ -303,7 +307,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
void SetZeroAll();
/* set the tensor with a data array */
void SetData(const void * d, int num, int beg = 0);
......@@ -311,8 +315,8 @@ public:
/* generate data items with a uniform distribution in [0, 1] */
void Rand(int rNum, int cNum);
/* generate data items with a range by start, end and the step */
void Range(DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items with a range by start, end and step */
void Range(int lower, int upper, int step);
/* generate data items with a fixed value */
template<class T>
......
......@@ -38,7 +38,7 @@ XThread::XThread()
#endif
MUTEX_INIT(gMutex);
function = NULL;
argv = NULL;
argv.Clear();
toBreak = false;
jobCount = 0;
working = 0;
......@@ -69,6 +69,18 @@ void * XThread::Wrapper(void * ptr)
return 0;
}
/*
initialize the thread with the function and its parameters
>> myFunc - the function to run
>> myArgv - arguments of the function
*/
void XThread::SetFunc(TFunction myFunc, XList * myArgv)
{
function = myFunc;
argv.Clear();
argv.AddList(myArgv);
}
/*
The running loop of this thread. It is a very naive implementation.
......@@ -77,6 +89,10 @@ After that, we wait again if there is no new job.
*/
void XThread::Run()
{
if (function == NULL) {
ShowNTErrors("You are running a thread with no function specified!");
}
#ifdef _WIN32
//COND_RESET(gCond);
#endif
......@@ -104,7 +120,7 @@ void XThread::Run()
}
/* do what you want to do*/
function(argv);
function(&argv);
#ifdef USE_PTHREAD
jobCount--;
......
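A sketch of launching a worker with the new SetFunc interface (worker, MyJob, and x are illustrative); the thread now owns a private copy of the argument list instead of a raw pointer:

XThread worker;
XList args(1);
args.Add(&x);
worker.SetFunc((TFunction)MyJob, &args);   /* copies args into worker.argv */
worker.Start();                            /* create the underlying thread */
worker.LetItGo();                          /* signal it to process the job */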
......@@ -54,38 +54,7 @@ namespace nts{
(unsigned)(flag), (unsigned *)(id))
#endif
//////////////////////////////////////////////////
// mutex
#ifdef WIN32
#define THREAD_HANDLE HANDLE
#define MUTEX_HANDLE CRITICAL_SECTION
#define COND_HANDLE HANDLE
#define MUTEX_INIT( x ) InitializeCriticalSection( &(x) )
#define MUTEX_DELE( x ) DeleteCriticalSection( &(x) )
#define MUTEX_LOCK( x ) EnterCriticalSection( &(x) )
#define MUTEX_UNLOCK( x ) LeaveCriticalSection( &(x) )
#define COND_INIT( x ) ( x = CreateEvent( NULL, false, false, NULL ) )
#define COND_DELE( x ) CloseHandle( (x) )
#define COND_WAIT( x, y ) WaitForSingleObject( (x), INFINITE )
#define COND_SIGNAL( x ) SetEvent( (x) )
#define COND_RESET( x) ResetEvent( (x) )
#else
#define THREAD_HANDLE pthread_t
#define MUTEX_HANDLE pthread_mutex_t
#define COND_HANDLE pthread_cond_t
#define MUTEX_INIT( x ) pthread_mutex_init( &(x), NULL )
#define MUTEX_DELE( x ) pthread_mutex_destroy( &(x) )
#define MUTEX_LOCK( x ) pthread_mutex_lock( &(x) )
#define MUTEX_UNLOCK( x ) pthread_mutex_unlock( &(x) )
#define COND_INIT( x ) pthread_cond_init( &(x), NULL )
#define COND_DELE( x ) pthread_cond_destroy( &(x) )
#define COND_WAIT( x, y ) pthread_cond_wait( &(x), &(y) )
#define COND_SIGNAL( x ) pthread_cond_signal( &(x) )
#define COND_BROADCAST( x ) pthread_cond_broadcast( &(x) )
#endif
typedef void (*TFunction) (volatile TensorList*);
typedef void (*TFunction) (volatile XList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -128,12 +97,10 @@ public:
public:
/* function to run */
volatile
TFunction function;
/* arguments (for the function to run) */
volatile
TensorList * argv;
XList argv;
/* a flag to break */
volatile
......@@ -154,6 +121,9 @@ public:
/* a wrapper for the start-routine parameter in pthread_create */
static void * Wrapper(void * ptr);
/* initialize the thread with the function and its parameters */
void SetFunc(TFunction myFunc, XList * myArgv);
/*
Core of the thread. It is a very naive implementation.
We loop and wait for a signal to activate the job processing.
......
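Under the new typedef a job function receives a volatile XList of its arguments; a minimal shape (names illustrative):

void MyJob(volatile XList * args)
{
    XList * argv = (XList*)args;              /* cast away volatile for access */
    XTensor * x = (XTensor*)argv->GetItem(0); /* unpack the first argument */
    /* ... do the actual work on x ... */
}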
......@@ -155,13 +155,13 @@ void XMemSet(int devID, void * p, int value, size_t size)
cudaMemcpyKind GetMemcpyKind(int devIDFrom, int devIDTo)
{
if(devIDFrom < 0 && devIDTo < 0)
return cudaMemcpyHostToHost;
return cudaMemcpyKind::cudaMemcpyHostToHost;
else if(devIDFrom < 0 && devIDTo >= 0)
return cudaMemcpyHostToDevice;
return cudaMemcpyKind::cudaMemcpyHostToDevice;
else if(devIDFrom >= 0 && devIDTo < 0)
return cudaMemcpyDeviceToHost;
return cudaMemcpyKind::cudaMemcpyDeviceToHost;
else
return cudaMemcpyDeviceToDevice;
return cudaMemcpyKind::cudaMemcpyDeviceToDevice;
}
#endif
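For example, under the convention that a negative devID denotes the host:

cudaMemcpyKind kind = GetMemcpyKind(-1, 0);   /* host -> GPU 0: cudaMemcpyHostToDevice */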
......@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......@@ -523,6 +485,9 @@ unsigned int GetNextPower2(unsigned int n)
/* sleep for a while */
void XSleep(int sleepTime)
{
if (sleepTime <= 0)
return;
#ifdef _WIN32
Sleep((DWORD)sleepTime);
#else
......@@ -591,9 +556,9 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
hi = (char*)data + (long)realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
indexhi = index != NULL ? (int*)index + (long)stride * (num - 1) : NULL;
recurse:
......@@ -603,8 +568,8 @@ recurse:
if(size <= MIN_QSORT_NUM)
XShortSort(lo, hi, indexlo, indexhi, width, stride, comp);
else {
mid = lo + (size/2) * realStride;
indexmid = indexlo + (size/2) * stride;
mid = lo + (long)(size/2) * realStride;
indexmid = indexlo + (long)(size/2) * stride;
/* sort the first, last and middle elements into order */
if(comp(lo, mid) > 0)
......@@ -872,8 +837,7 @@ int SplitALine(char* inputString, const char* seperator, StrList* items)
return 0;
if (sepLen == 0) {
char* item = new char[inputLen + 1];
char* item = new char[(long)inputLen + 1];
strcpy(item, inputString);
items->Add(item);
}
......
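The (long) casts added above are overflow guards rather than cosmetics: the products are used as offsets and can exceed 32 bits. An illustrative case (made-up numbers; the widening helps on LP64 platforms, where long is 64-bit):

int num        = 70000000;                 /* 70M items      */
int realStride = 64;                       /* bytes per item */
long ok  = (long)realStride * (num - 1);   /* ~4.48e9, computed in 64-bit */
int  bad = realStride * (num - 1);         /* wraps: signed 32-bit overflow */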
......@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
where trans() returns the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
#endif
......
......@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
......
......@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -121,8 +125,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedX == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
//int xm = transposedX == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
//int wn = transposedW == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedW == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,8 +181,10 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
}
\ No newline at end of file
}
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -147,25 +147,27 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
int * bp = (int*)b->data;
int * cp = (int*)c->data;
/* TODO: new code for beta = 1. the following code might be slow because it introduces
additional floating-point computation. */
/* unrolling */
int num = a->unitNum;
if (num % 4 == 0) {
for (int i = 0; i < num; i += 4) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i + 2] = ap[i + 2] + bp[i + 2] * beta;
cp[i + 3] = ap[i + 3] + bp[i + 3] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
cp[i + 2] = ap[i + 2] + (int)(bp[i + 2] * beta);
cp[i + 3] = ap[i + 3] + (int)(bp[i + 3] * beta);
}
}
else if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
cp[i] = ap[i] + bp[i] * beta;
cp[i + 1] = ap[i + 1] + bp[i + 1] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
cp[i + 1] = ap[i + 1] + (int)(bp[i + 1] * beta);
}
}
else {
for (int i = 0; i < num; i++) {
cp[i] = ap[i] + bp[i] * beta;
cp[i] = ap[i] + (int)(bp[i] * beta);
}
}
}
......
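A worked example of the integer path above: the multiply still happens in floating point, and the explicit cast truncates the result back to int.

int a = 3, b = 5;
DTYPE beta = 0.5F;
int c = a + (int)(b * beta);   /* b * beta = 2.5F, truncated to 2, so c = 5 */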
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -113,6 +113,9 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
int version = 0;
cudaRuntimeGetVersion(&version);
/*
matrix-matrix multiplication
For row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T
......@@ -327,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -353,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -96,9 +96,12 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
/*
convert index tensor to onehot tensor
>> index - index tensor, which value is an integer num
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
>> index - index of the output dimension (over the vocabulary)
>> onehot - one-hot representation of the index
>> size - vocabuary size (last dimension size of onehot)
>> labelSmoothingP - the parameter that controls how smooth the output is.
E.g., p = 0 means no smoothing
p = 1 means a uniform distribution (almost)
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
......
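As a concrete sketch of one common smoothing scheme (assumed here; the kernel body is not shown in this hunk), the gold index receives 1 - p and every other entry receives p / size:

/* e.g., size = 4, index = 2, labelSmoothingP = 0.1:
       onehot = {0.025, 0.025, 0.9, 0.025}
   p = 0  ->  {0, 0, 1, 0}              a hard one-hot
   p = 1  ->  {0.25, 0.25, 0, 0.25}     nearly uniform, hence "(almost)" */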
......@@ -483,7 +483,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
else if (tensor->dataType == X_FLOAT16) {
unsigned short* d = (unsigned short*)tensor->data;
for (int i = 0; i < tensor->unitNum; i++) {
d[i] = variance * ((unsigned short)rand() / RAND_MAX) + lower;
d[i] = (unsigned short)(variance * ((unsigned short)rand() / RAND_MAX) + lower);
}
}
else if(tensor->dataType == X_DOUBLE){
......@@ -538,17 +538,17 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
/* generate data items with a range by start, end and the step
>> tensor - the tensor whose data array would be initialized
>> start - the begin of the array
>> end - the end of the array (not included self)
>> step - the step of two items
>> beg - the beginning of the array
>> end - the end of the array (it does not include itself)
>> step - the step we take along the array
*/
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
void _SetDataRange(XTensor * tensor, int beg, int end, int step)
{
CheckNTErrors((tensor->order == 1), "Tensor must be 1 dimension!");
/* compute the true length according to the (start, end, step) */
DTYPE size = (DTYPE)fabs(upper - lower);
int num = ceil(size / fabs(step));
DTYPE size = (DTYPE)fabs(end - beg);
int num = (int)ceil(size / fabs(step));
CheckNTErrors((tensor->unitNum == num), "Unit number of the tensor is not matched.");
/* init an integer array to store the sequence */
......@@ -556,12 +556,13 @@ void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step)
if (tensor->dataType == X_INT) {
data = new int[num];
for (int i = 0; i < num; i++)
*((int*)data + i) = lower + i * step;
*((int*)data + i) = beg + i * step;
}
else if (tensor->dataType == X_FLOAT) {
data = new float[num];
for (int i = 0; i < num; i++)
*((float*)data + i) = lower + i * step;
ShowNTErrors("TODO! Unsupported datatype!")
//data = new float[num];
//for (int i = 0; i < num; i++)
// *((float*)data + i) = beg + i * step;
}
else {
ShowNTErrors("TODO! Unsupported datatype!")
......@@ -695,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -57,8 +57,8 @@ void _SetDataRand(XTensor * tensor, int rNum, int cNum);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a range by start, end and the step */
void _SetDataRange(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE step);
/* generate data items in a range [begin, end) with a step */
void _SetDataRange(XTensor * tensor, int beg, int end, int step);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
......
......@@ -63,9 +63,9 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
int* db = (int*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > upper)
db[i] = upper;
db[i] = (int)upper;
else if (d[i] < lower)
db[i] = lower;
db[i] = (int)lower;
else
db[i] = d[i];
}
......
......@@ -86,7 +86,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
for(int i = 0; i < num; i++){
int * v = (int*)f;
int * vb = (int*)fb;
*vb = *v * scale + shift;
*vb = (int)(*v * scale + shift);
f += sizeof(int) + sizeof(int);
fb += sizeof(int) + sizeof(int);
}
......@@ -96,7 +96,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
int * va = (int*)a->data;
int * vb = (int*)b->data;
for(int i = 0; i < b->unitNum; i++){
*vb = *va * scale + shift;
*vb = (int)(*va * scale + shift);
va++;
vb++;
}
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
_CudaCopyValues(s, t);
return;
}
#endif
......@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void CopyValues(const XTensor &s, XTensor &t, XStream * stream)
void CopyValues(const XTensor &s, XTensor &t)
{
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
}
/*
......@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copyed tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
XTensor CopyValues(const XTensor &s)
{
XTensor t(&s);
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
/* tensor connection */
if (s.enableGrad) {
......
......@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy a range of elements from a source vector to a target vector
>> s - source matrix
>> t - target matrix
>> stream - the stream for creating the job pipeline
<< return - succeed or not
*/
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
......@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
......@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
......
......@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL);
void CopyValues(const XTensor &s, XTensor &t);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor)
......
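With the stream parameter gone, a copy is now always synchronous; usage shrinks to (s is any tensor, names illustrative):

XTensor t = CopyValues(s);   /* deep copy of s, blocking until the data has moved */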
......@@ -115,7 +115,7 @@ void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
for (int i = 0; i < indexSize; i++) {
int sIndex = sIndexData[i] * stride;
CheckNTErrors(sIndex < s->unitNum, "Wrong index!");
CheckNTErrors(sIndex < s->unitNum && sIndex >= 0, "Wrong index!");
for (int j = 0; j < stride; j++)
tData[i * stride + j] = sData[sIndex + j];
}
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -231,8 +231,8 @@ And this is a special spread function for backward computation of gather functio
*/
void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
{
int dim = 0;
int order = source->order;
//int dim = 0;
//int order = source->order;
CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
CheckNTErrors(collection->GetDim(-1) == source->GetDim(-1), "Illegal dimension!");
......@@ -272,4 +272,4 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
max_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,7 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
float dr = (!source->isSparse) ? 1.0F : source->denseRatio;
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -82,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -122,4 +129,4 @@ DTYPE ReduceSumAllValue(const XTensor & source)
return target.Get0D();
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
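A usage sketch of the scalar reduction (a is an illustrative dense tensor):

DTYPE total = ReduceSumAllValue(a);   /* sum of all items, returned as a host scalar */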
......@@ -32,14 +32,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it along a dimension.
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -185,13 +209,13 @@ bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
e.g., (3, M, N/3) -> (M, N)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (3, M, N/3) -> (M, N)
for example, whereToMerge = 2 (i.e., the dimension for "N/3")
leadingDim = 0 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
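A shape-level sketch of the corrected documentation (sizes illustrative, InitTensor3DV2 assumed as the initializer): merging folds the leading dimension of size 3 into the last dimension.

XTensor s;
InitTensor3DV2(&s, 3, 2, 4, X_FLOAT, -1);   /* s has shape (3, 2, 4)            */
XTensor t = Merge(s, 2, 0);                 /* whereToMerge = 2, leadingDim = 0 */
                                            /* t has shape (2, 12)              */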
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
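A subtlety in this overload: when uniform holds, dataTMP aliases smallsItem0->data and is only borrowed, so nothing should be released or freed at all. A hedged sketch of a release path that pairs one-for-one with the allocation above (a reading of the intent, not the committed code):

#include <cstdlib>

// Sketch: release only what was actually allocated. `fromPool` plays the
// role of (mem != NULL); when uniform, dataTMP is borrowed and left alone.
void ReleasePairingSketch(bool uniform, bool fromPool, void * dataTMP)
{
    if (!uniform) {
        if (fromPool) {
            /* mem->ReleaseBuf(mem->devID, size); */
            /* mem->UnlockBuf();                  */
        }
        else {
            std::free(dataTMP);   // stands in for XMemFree(t->devID, dataTMP)
        }
    }
    /* when uniform: no release, dataTMP aliases an existing tensor */
}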
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
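Here the new lock brackets a pinned region. Judging by the call names, SetPinBuf records the current top of the buffer and BackToPinBuf rewinds to it, reclaiming every intermediate AllocBuf at once; the lock therefore has to span the whole region. A sketch under that assumption:

#include <cstddef>

// Assumed semantics of SetPinBuf/BackToPinBuf, inferred from the call
// names only: pin the current top, allocate freely, rewind in one step.
struct PinnedBuffer {
    size_t used = 0;
    size_t pin = 0;
    void SetPinBuf()    { pin = used; }   // remember the current top
    void BackToPinBuf() { used = pin; }   // drop everything since the pin
    size_t AllocBuf(size_t size) {        // returns an offset, for brevity
        size_t off = used;
        used += size;
        return off;
    }
};
// The lock must cover the whole pinned region, exactly as in the hunk:
// LockBuf -> SetPinBuf -> AllocBuf... -> BackToPinBuf -> UnlockBuf.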
......
......@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -124,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
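_Split mirrors _Merge here: when source and target share a device, dataTMP simply aliases t->data and never touches the pool, so the critical section shrinks to the index scratch alone. As a self-contained sketch (malloc/free stand in for AllocBuf/ReleaseBuf):

#include <cstdlib>
#include <mutex>

// Same-device case in _Split (sketch): only the block index uses the
// pool's scratch, so only it needs the lock.
static std::mutex poolMutex;

void SplitIndexScratchSketch(int blockNum)
{
    poolMutex.lock();                                             // LockBuf()
    int * blockIndex = (int*)std::malloc(blockNum * sizeof(int)); // AllocBuf
    /* ..._MakeSplitBlockIndex and _CopyBlocksOnSite run here... */
    std::free(blockIndex);                                        // ReleaseBuf
    poolMutex.unlock();                                           // UnlockBuf()
}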
......@@ -147,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -321,27 +331,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/API calls (unverified optimization; this path may be removed later) */
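For reference, the branch deleted in the two hunks above copied each split slice with a pitched 2D copy; the XMemCopy2D calls amount to n rows of mSize bytes with independent source and target pitches. A host-only simplification that ignores devices and streams:

#include <cstddef>
#include <cstring>

// Host-only sketch of a pitched 2D copy: n rows of mSize bytes each,
// where source and target rows are spaced by their own pitches.
void MemCopy2DHostSketch(char * tData, size_t tPitch,
                         const char * sData, size_t sPitch,
                         size_t mSize, int n)
{
    for (int i = 0; i < n; i++)
        std::memcpy(tData + i * tPitch, sData + i * sPitch, mSize);
}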
......@@ -362,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -383,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -43,13 +43,11 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
int blockSize = 1;
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
XTensor * smallsItem0 = smalls->GetItem(0);
int unitNum = smallsItem0->unitNum;
//int unitNum = smallsItem0->unitNum;
int unitSize = smallsItem0->unitSize;
int itemSize = unitNum * unitSize;
for (int i = 0; i < smallsItem0->order; i++) {
if (i >= dim)
......@@ -129,7 +127,7 @@ bool CheckStackShape(const TensorList &smalls, XTensor &t, int dim)
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
for (int i = 0; i < tensor->order; i++) {
for (int i = 0; i < order; i++) {
if (i < dim) {
if (t.GetDim(i) != tensor->GetDim(i))
return false;
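The fix above just caches order before the loop. For context, a hedged sketch of what the full check plausibly verifies; the diff truncates after the i < dim branch, so the handling of the remaining dimensions is an assumption:

#include <vector>

// Sketch (assumed semantics): does stacking `count` tensors of shape
// `small` along `dim` produce shape `target`? Only the i < dim branch
// is visible in the library's CheckStackShape above.
bool CheckStackShapeSketch(const std::vector<int> & small,
                           const std::vector<int> & target,
                           int dim, int count)
{
    if ((int)target.size() != (int)small.size() + 1)
        return false;
    for (int i = 0; i < (int)target.size(); i++) {
        if (i < dim  && target[i] != small[i])      return false;
        if (i == dim && target[i] != count)         return false;
        if (i > dim  && target[i] != small[i - 1])  return false;
    }
    return true;
}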
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
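Two things are worth noting here: bufIndex reuses the lock already taken for buf (a second LockBuf would deadlock if the lock is not reentrant), and the scratch is sized n * m elements because the sort, presumably bitonic-style, wants runs padded to a power of two. A sketch of the round-up arithmetic (the real GetNextPower2 implementation is not shown in the diff, so this one is an assumption):

// Assumed implementation of the power-of-two round-up that sizes the
// sort buffer: m = GetNextPower2(strideNum), scratch = n * m units.
unsigned int GetNextPower2Sketch(unsigned int x)
{
    if (x <= 1)
        return 1;
    x--;                              // handle exact powers of two
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
    x |= x >> 8;  x |= x >> 16;       // smear the highest set bit down
    return x + 1;
}
// e.g., strideNum = 1000 -> m = 1024; with n = stride * blockNum runs,
// the scratch holds n * m elements of a->unitSize bytes each.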
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......