Commit ae6e43fd by liyinqiao

Merge with xiaotong branch.

1. Add a mutex to guard memory-pool operations.
2. Support CMake 3.19 and fix some CMake bugs. Note that the CUDA_ROOT variable in CMake has been renamed to CUDA_TOOLKIT_ROOT; you can find this update in the README.
3. Fix some bugs on Ubuntu.
parent 7e102f8a
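
The locking discipline added throughout the diff below always has the same shape: lock the memory pool before drawing a temporary buffer from it, unlock after the buffer is released. A minimal, self-contained C++ sketch of that idea using a plain std::mutex (MemPool here is an illustrative stand-in for the library's XMem; only the names LockBuf/UnlockBuf/AllocBuf/ReleaseBuf mirror the diff):

#include <array>
#include <cstddef>
#include <mutex>
#include <thread>

/* Illustrative stand-in for XMem: a scratch buffer guarded by a mutex
   so AllocBuf/ReleaseBuf pairs from different threads cannot interleave. */
class MemPool {
public:
    void LockBuf()   { bufMutex.lock(); }
    void UnlockBuf() { bufMutex.unlock(); }

    /* bump allocation; only valid while the lock is held */
    void * AllocBuf(std::size_t size) {
        void * p = buffer.data() + used;
        used += size;
        return p;
    }
    void ReleaseBuf(std::size_t size) { used -= size; }

private:
    std::mutex bufMutex;
    std::array<char, 1 << 20> buffer{};
    std::size_t used = 0;
};

/* the pattern the commit applies at every temporary-buffer site */
void Worker(MemPool * mem) {
    mem->LockBuf();                   /* lock before touching the pool */
    void * tmp = mem->AllocBuf(1024); /* temporary scratch space */
    (void)tmp;                        /* ... use tmp ... */
    mem->ReleaseBuf(1024);
    mem->UnlockBuf();                 /* unlock after the scratch is gone */
}

int main() {
    MemPool pool;
    std::thread t1(Worker, &pool);
    std::thread t2(Worker, &pool);
    t1.join();
    t2.join();
    return 0;
}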
@@ -25,19 +25,19 @@ option(USE_MKL "Use MKL" OFF)
 option(USE_OPENBLAS "Use OpenBLAS" OFF)
 option(GEN_DLL "Generate Dynamic Link Library" OFF)
-# If set USE_CUDA ON, please modify CUDA_ROOT below.
+# If set USE_CUDA ON, please modify CUDA_TOOLKIT_ROOT below.
 # If set USE_MKL ON, please modify the INTEL_ROOT below.
 # If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
 if (USE_CUDA)
-    if(NOT EXISTS ${CUDA_ROOT})
+    if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
         if(WIN32)
-            set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
+            set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
         else()
-            set(CUDA_ROOT "/usr/local/cuda-9.2")
+            set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
         endif()
     endif()
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
-    message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
+    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
+    message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
 endif()
 if(USE_MKL)
     if(NOT DEFINED INTEL_ROOT)
@@ -128,12 +128,13 @@ if(USE_CUDA)
     if(WIN32)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
+        string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
         set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
-        link_directories("${CUDA_ROOT}/lib/x64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
         if(CUDA_VERSION_MAJOR EQUAL 11)
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
@@ -146,9 +147,9 @@ if(USE_CUDA)
         set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
         set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
-        link_directories("${CUDA_ROOT}/lib64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
         if(CUDA_VERSION_MAJOR EQUAL 11)
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
@@ -158,7 +159,13 @@ if(USE_CUDA)
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
+        if(EXISTS "/usr/lib64/libdl.so.2")
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
+        elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
+        elseif(EXISTS "/lib64/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
+        endif()
     endif()
 endif()
...
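With the rename above, a CUDA build that does not use the default toolkit location would be configured along the lines of `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT=/usr/local/cuda-9.2 ..` (hypothetical path; the variable name is the one introduced in this hunk). The libdl.so.2 probing added at the end of the hunk is what lets the static CUDA link work across Debian/Ubuntu-style (/lib/x86_64-linux-gnu) and RedHat-style (/usr/lib64, /lib64) library layouts.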
@@ -105,11 +105,15 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
     if (!isEfficient || a->isGrad) {
         XNoder::MakeGrad(a);
+        if (a->mem != NULL)
+            a->mem->LockBuf();
         XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
         _ConvertDataType(node->grad, tmp);
         _SumMe(a->grad, tmp);
         DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -141,11 +145,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -173,12 +181,16 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         tmp->SetZeroAll();
         _SpreadForGather(tmp, node->grad, index);
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -200,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _CopyValues(node->grad, tmp);
@@ -212,6 +226,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -456,12 +472,16 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
         /* if the tensor is used somewhere else, we need another SUM
            for gradient accumulation */
         else {
+            if (input->mem != NULL)
+                input->mem->LockBuf();
             XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
             _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
             _Sum(input->grad, inputGradTMP, input->grad);
             DelTensorBuf(inputGradTMP);
+            if (input->mem != NULL)
+                input->mem->UnlockBuf();
         }
     }
@@ -543,12 +563,16 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
            somewhere else, we need another SUM for gradient
            accumulation */
         else {
+            if (node->mem != NULL)
+                node->mem->LockBuf();
             XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
             _Merge(&splits, nodeGradTMP, whereToSplit + 1);
             _Sum(node->grad, nodeGradTMP, node->grad);
             DelTensorBuf(nodeGradTMP);
+            if (node->mem != NULL)
+                node->mem->UnlockBuf();
         }
     }
@@ -584,11 +608,15 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
         CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
         CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _Transpose(output->grad, tmp, i, j);
         _Sum(input->grad, tmp, input->grad);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -622,12 +650,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
         _ReduceSum(output->grad, tmp, dim);
         _Sum(input->grad, tmp, input->grad);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
...
@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
                                              paddingEnc.dataType, paddingEnc.devID);
     XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     DelTensorBuf(maskEncDecTMPDec);
     DelTensorBuf(maskEncDecTMPEnc);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
     /* padding on the source side */
     int* dimsPadding = new int[paddingEnc.order + 2];
@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
     dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
                                      paddingEnc.devID);
@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     DelTensorBuf(padding3);
     DelTensorBuf(padding2);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
 }
 /*
...
@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
             _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
             /* v2 = m / (sqrt(v) + delta) */
+            GMems.GetMem(v->devID)->LockBuf();
             XTensor* v2 = NewTensorBuf(v, v->devID);
             _Power(v, v2, 0.5F);
             _ScaleAndShiftMe(v2, 1.0F, d);
@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
             _Sum(para, v2, para, -e);
             DelTensorBuf(v2);
+            GMems.GetMem(v->devID)->UnlockBuf();
         }
         else {
             /* the delta rule */
...
@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
     if (b.order == 0){
         DTYPE scale = 1.0F / b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
         XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
         XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
         ScaleAndShift(a, *tmp1, scale, 0.0F);
         ScaleAndShift(c, *tmp2, alpha, 0.0F);
         Sum(*tmp2, *tmp1, c);
-        DelTensorBuf(tmp1);
         DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
     }
     else {
         int n = GetBroadcastDimIndex(a, b);
...
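A detail worth noting in the Div hunk above (and the Multiply hunk below): c's pool is locked only when it is a different pool from a's. Assuming the pool mutex is non-recursive, as the guard implies, locking the same mutex twice from one thread would deadlock. A condensed, self-contained sketch of the guard (aPool/cPool stand in for a.mem/c.mem):

#include <mutex>

/* aPool/cPool stand in for a.mem/c.mem; a null pool means
   "no memory pool, allocate directly on the device". */
void ScalarOpSketch(std::mutex * aPool, std::mutex * cPool) {
    if (aPool != nullptr)
        aPool->lock();
    /* ... tmp1 = NewTensorBufV2(&a, ...) ... */

    if (cPool != nullptr && cPool != aPool)
        cPool->lock();                /* second lock only if distinct */
    /* ... tmp2 = NewTensorBufV2(&c, ...), compute ... */

    /* release in reverse order of acquisition */
    if (cPool != nullptr && cPool != aPool)
        cPool->unlock();
    if (aPool != nullptr)
        aPool->unlock();
}

The hunk also moves DelTensorBuf(tmp1) after DelTensorBuf(tmp2), so each temporary is released, and its pool unlocked, in reverse order of allocation.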
@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+    if (x.mem != NULL)
+        x.mem->LockBuf();
     XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
     /* call _MatrixMul function */
@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     /* destroy variables */
     delete[] dimSize;
     DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();
     return c;
 }
@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
     float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+    if (x.mem != NULL)
+        x.mem->LockBuf();
     XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
     /* call _MatrixMul function */
@@ -175,6 +181,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
     /* destroy variables */
     delete[] dimSize;
     DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();
     return c;
 }
...
@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
     if (b.order == 0){
         DTYPE scale = b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
         XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
         XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
         ScaleAndShift(a, *tmp1, scale, 0.0F);
         ScaleAndShift(c, *tmp2, alpha, 0.0F);
         Sum(*tmp2, *tmp1, c);
-        DelTensorBuf(tmp1);
         DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
     }
     else {
         int n = GetBroadcastDimIndex(a, b);
...
@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
             source = target;
         }
-        target = t->mem != NULL ?
-                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                 XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        /*target = t->mem != NULL ?
+                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
+                 XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+        if (t->mem != NULL) {
+            t->mem->LockBuf();
+            target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+        }
+        else {
+            target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        }
         s->data = source;
         t->data = target;
@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
         /* free the memory space of the one before the last allocation */
         if(count > 0){
             int size = s->unitNum * s->unitSize;
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, size);
+            }
             else
                 XMemFree(t->devID, source);
         }
@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
         if(isLast){
             CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
             _Multiply(a, t, c, beta);
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                t->mem->UnlockBuf();
+            }
             else
                 XMemFree(t->devID, target);
             target = NULL;
...
@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
             source = target;
         }
-        target = t->mem != NULL ?
-                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                 XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        /*target = t->mem != NULL ?
+                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
+                 XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+        if (t->mem != NULL) {
+            t->mem->LockBuf();
+            target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+        }
+        else {
+            target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        }
         s->data = source;
         t->data = target;
@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
         if(isLast){
             CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
             _Sum(a, t, c, beta);
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                t->mem->UnlockBuf();
+            }
             else
                 XMemFree(t->devID, target);
             target = NULL;
...
@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     DTYPE ** cpGPU = NULL;
     if (mem != NULL) {
+        mem->LockBuf();
         mem->SetPinBuf();
         apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
         bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     delete[] bp;
     delete[] cp;
-    if(mem != NULL)
+    if (mem != NULL) {
         mem->BackToPinBuf();
+        mem->UnlockBuf();
+    }
     else {
         XMemFree(a0->devID, apGPU);
         XMemFree(a0->devID, bpGPU);
...
@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
 #ifdef USE_CUDA
     XMem * mem = tensor->mem;
     MTYPE size = num * sizeof(MTYPE);
-    MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+    //MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+    MTYPE * offsetsCuda;
+    if (mem != NULL) {
+        mem->LockBuf();
+        offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
+    }
+    else {
+        offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
+    }
     XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
     _CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, size);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(tensor->devID, offsetsCuda);
 #else
...
@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
     int devIDBackup;
     ProtectCudaDev(tensor->devID, devIDBackup);
-    MTYPE * offsetsCuda = mem != NULL ?
-                          (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
-                          (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
-    void * valuesCuda = mem != NULL ?
-                        mem->AllocBuf(mem->devID, valueSize) :
-                        XMemAlloc(tensor->devID, valueSize);
+    /*MTYPE * offsetsCuda = mem != NULL ?
+                          (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
+                          (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+    void * valuesCuda = mem != NULL ?
+                        mem->AllocBuf(mem->devID, valueSize) :
+                        XMemAlloc(tensor->devID, valueSize);*/
+    MTYPE * offsetsCuda;
+    void * valuesCuda;
+    if (mem != NULL) {
+        mem->LockBuf();
+        offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
+        valuesCuda = mem->AllocBuf(mem->devID, valueSize);
+    }
+    else {
+        offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+        valuesCuda = XMemAlloc(tensor->devID, valueSize);
+    }
     if (mem != NULL) {
         XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
     if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, valueSize);
         mem->ReleaseBuf(mem->devID, offsetSize);
+        mem->UnlockBuf();
     }
     else {
         XMemFree(tensor->devID, valuesCuda);
...
@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
     if (devID >= 0) {
 #ifdef USE_CUDA
         /* copy the index from host to device */
-        int * targetBlocksTMP = myMem != NULL ?
-                                (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
-                                (int*)XMemAlloc(devID, blockNum * sizeof(int));
+        /*int * targetBlocksTMP = myMem != NULL ?
+                                (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
+                                (int*)XMemAlloc(devID, blockNum * sizeof(int));*/
+        int * targetBlocksTMP;
+        if (myMem != NULL) {
+            myMem->LockBuf();
+            targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
+        }
+        else {
+            targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
+        }
         XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
         _CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
-        if(myMem != NULL)
+        if (myMem != NULL) {
             myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+            myMem->UnlockBuf();
+        }
         else
             XMemFree(devID, targetBlocksTMP);
 #else
...
@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
 #ifdef USE_CUDA
     int * indexGPU = index;
     if (!isIndexOnDev) {
+        myMem->LockBuf();
         indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
         XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
     }
     _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
-    if (!isIndexOnDev)
+    if (!isIndexOnDev) {
         myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
+        myMem->UnlockBuf();
+    }
 #else
     ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
...
@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
     ProtectCudaDev(devID, devIDBackup);
     /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = myMem != NULL ?
-                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
-                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
-    int * targetBlocksTMP = myMem != NULL ?
-                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
-                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    /*int * sourceBlocksTMP = myMem != NULL ?
+                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
+                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    int * targetBlocksTMP = myMem != NULL ?
+                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
+                            (int *)XMemAlloc(devID, blockNum * sizeof(int));*/
+    int * sourceBlocksTMP;
+    int * targetBlocksTMP;
+    if (myMem != NULL) {
+        myMem->LockBuf();
+        sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    }
     XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
     XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
     if (myMem != NULL) {
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        myMem->UnlockBuf();
     }
     else {
         XMemFree(devID, sourceBlocksTMP);
...
@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
             CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
         }
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
     }
     else {
@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     }
     if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(mem->devID, sIndex);
     }
@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
             CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
         }
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
     }
     else {
@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
     else {
         ShowNTErrors("Unsupported dataType!");
     }
+    if (srcIndex->devID < 0) {
+        if (mem != NULL) {
+            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
+        else
+            XMemFree(mem->devID, sIndex);
+    }
 }
 #endif // USE_CUDA
...
@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
     DTYPE * c = (DTYPE*)collection->data;
     XMem * mem = source->mem;
-    int * si = mem != NULL ?
-               (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
-               (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+    /*int * si = mem != NULL ?
+               (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
+               (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
+    int * si;
+    if (mem != NULL) {
+        mem->LockBuf();
+        si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
+    }
+    else {
+        si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+    }
     int * ci = si + indexSize;
     XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
     KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
                                             stride, indexSize, si, ci);
-    if(mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(mem->devID, si);
 }
@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
     dim3 threads(cudaBlocks[0], cudaBlocks[1]);
     if (srcIndex->devID < 0) {
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
     }
     else
@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
     }
     if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(devID, sIndex);
     }
...
@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
     XMem * mem = input->mem; \
     GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
     int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
+    DTYPE * buf; \
+    if (mem != NULL) { \
+        mem->LockBuf(); \
+        buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
+    } \
+    else { \
+        buf = (DTYPE*)XMemAlloc(devID, bufSize); \
+    } \
     DTYPE * buf1 = buf; \
     DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
     do { \
@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
     \
     } while (strideNum > 1); \
     \
-    if (mem != NULL) \
+    if (mem != NULL) { \
         mem->ReleaseBuf(mem->devID, bufSize); \
+        mem->UnlockBuf(); \
+    } \
     else \
         XMemFree(input->devID, buf); \
 } \
...
@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
     int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+    //DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+    DTYPE * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
+    }
+    else {
+        buf = (DTYPE*)XMemAlloc(devID, bufSize);
+    }
     DTYPE * buf1 = buf;
     DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
     do {
@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     } while (strideNum > 1);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, bufSize);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(devID, buf);
 }
...
@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
     int dims[1] = {source->unitNum};
+    if (source->mem != NULL)
+        source->mem->LockBuf();
     XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
     _CopyValues(source, all);
     _ReduceSum(all, target, 0);
     DelTensorBuf(all);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }
 /*
@@ -72,6 +76,8 @@ sum all the items of the tensor (It should be optimized!)
 void _ReduceSumAll(const XTensor * source, DTYPE * value)
 {
     int * dimSize = new int[MAX_TENSOR_DIM_NUM];
+    if (source->mem != NULL)
+        source->mem->LockBuf();
     XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
     target->SetTMPFlag();
@@ -81,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
     delete[] dimSize;
     DelTensorBuf(target);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }
 /*
...
@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
         void * dataTMP = t->data;
-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+        if (!isOnSameDevice) {
+            /*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(mem->devID, size);
+            }
+        }
         int blockNumInMerge = s->dimSize[leadingDim];
         int splitSizeInGrid = gridSize / blockNumInMerge;
         int realBlockSize = blockSize * t->unitSize;
-        int * blockIndex = (int*)(mem != NULL ?
-                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
+        /*int * blockIndex = (int*)(mem != NULL ?
+                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
+        }
         _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
         _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
-        if (mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
         else
             XMemFree(s->devID, blockIndex);
         if (!isOnSameDevice) {
             XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-            if (mem != NULL)
+            if (mem != NULL) {
                 mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
             else
                 XMemFree(s->devID, dataTMP);
         }
@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
         void * dataTMP = NULL;
         if (uniform)
             dataTMP = smallsItem0->data;
-        else
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+        else {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(t->devID, size);
+            }
+        }
         tensorTMP->data = dataTMP;
@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
         tensorTMP->data = NULL;
         delete tensorTMP;
-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
             mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(t->devID, dataTMP);
     }
...
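In the _Merge hunk above (and the _Split hunk below), the lock around the blockIndex scratch is taken only when isOnSameDevice holds. In the cross-device case the same pool was already locked when dataTMP was allocated, and it stays locked until dataTMP is released, so locking again for blockIndex would double-lock a presumably non-recursive mutex. A condensed sketch of that conditional (pool stands in for mem; the flags mirror the diff):

#include <mutex>

/* pool stands in for mem; a null pool means direct device allocation */
void MergeSketch(std::mutex * pool, bool isOnSameDevice) {
    if (!isOnSameDevice && pool != nullptr)
        pool->lock();                 /* lock spans dataTMP's lifetime */
    /* ... allocate dataTMP from the pool when devices differ ... */

    if (pool != nullptr && isOnSameDevice)
        pool->lock();                 /* otherwise the lock is already held */
    /* ... allocate blockIndex, build the index, copy blocks ... */
    if (pool != nullptr && isOnSameDevice)
        pool->unlock();

    if (!isOnSameDevice && pool != nullptr) {
        /* ... copy dataTMP to the target device, release it ... */
        pool->unlock();
    }
}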
@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
     GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
                           cudaGridSizes, cudaBlockSizes);
+    myMem->LockBuf();
     myMem->SetPinBuf();
     int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
         (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
     myMem->BackToPinBuf();
+    myMem->UnlockBuf();
     delete[] sourceArrays;
     delete[] targetArrays;
...
@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
         void * dataTMP = t->data;
-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+        if (!isOnSameDevice) {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(s->devID, size);
+            }
+        }
         int realBlockSize = blockSize * t->unitSize;
         int blockSplitSize = blockNum / splitNum;
-        int * blockIndex = (int*)(mem != NULL ?
-                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * sizeof(int)));
+        /*int * blockIndex = (int*)(mem != NULL ?
+                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
+        }
         _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
         _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
-        if (mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
         else
             XMemFree(s->devID, blockIndex);
@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
         if (!isOnSameDevice) {
             XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-            if (mem != NULL)
+            if (mem != NULL) {
                 mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
             else
                 XMemFree(s->devID, dataTMP);
         }
@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
             dataTMP = first->data;
         }
         else {
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(big->devID, size);
+            }
         }
         tensorTMP->data = dataTMP;
@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
         tensorTMP->data = NULL;
         delete tensorTMP;
-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
             mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(big->devID, dataTMP);
     }
...
@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     int m = GetNextPower2(strideNum);
     int n = stride * blockNum;
-    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    //void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    void * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
+    }
+    else {
+        buf = XMemAlloc(a->devID, n * m * a->unitSize);
+    }
     void * bufIndex = NULL;
     if (indexA != NULL && indexB != NULL) {
         bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
         (bufIndex, indexB->data, m, n, stride, k, blockNum);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(a->devID, n * m * a->unitSize);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(a->devID, buf);
     if (indexA != NULL && indexB != NULL)
...
@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
+    if (mem != NULL)
+        mem->LockBuf();
     max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
     sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
     DelTensorBuf(max);
     DelTensorBuf(sum);
+    if (mem != NULL)
+        mem->UnlockBuf();
     if (x->devID >= 0) {
         delete blockx;
...
@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     XTensor * max = NULL;
     XTensor * sum = NULL;
+    if (mem != NULL)
+        mem->LockBuf();
     max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
     sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     DelTensorBuf(sum);
     DelTensorBuf(max);
+    if (mem != NULL)
+        mem->UnlockBuf();
     delete[] dimSize;
 }
...
@@ -354,6 +354,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
         dimSize[i - 1] = output->dimSize[i];
     }
+    if (output->mem != NULL)
+        output->mem->LockBuf();
     XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
                                        output->devID, output->mem);
@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
         nonZeroNum = (DTYPE)lossBuf->unitNum;
     }
     else {
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->LockBuf();
+        }
         XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
         _IsNonZero(padding, tmp);
         _ReduceSumAll(tmp, &nonZeroNum);
         DelTensorBuf(tmp);
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->UnlockBuf();
+        }
     }
     loss = loss / nonZeroNum;
@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
     delete[] dimSize;
     DelTensorBuf(lossBuf);
+    if (output->mem != NULL)
+        output->mem->UnlockBuf();
     return loss;
 }
...
@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
 {
     int n = leadingDim < 0 ? output->order - 1 : leadingDim;
+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
     XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
     XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
     DelTensorBuf(interBuf2);
     DelTensorBuf(interBuf1);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }
 }
 /*
@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
         dimSize[i - 1] = output->dimSize[i];
     }
+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
     XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
                                        output->devID, output->mem);
@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
         nonZeroNum = (DTYPE)lossBuf->unitNum;
     }
     else {
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->LockBuf();
+        }
         XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
         _IsNonZero(padding, tmp);
         _ReduceSumAll(tmp, &nonZeroNum);
         DelTensorBuf(tmp);
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->UnlockBuf();
+        }
     }
     loss = loss / nonZeroNum;
@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
     delete[] dimSize;
     DelTensorBuf(lossBuf);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }
     return loss;
 }
...