Merge with liyinqiao branch.

1. Add mutex when operating the memory pool. 2. Support CMake 3.19 and fix some CMake bugs. Note that CUDA_ROOT variable in CMake is modified as CUDA_TOOLKIT_ROOT. You can find this update in the README. 3. Fix some bugs on Ubuntu.

Merge with liyinqiao branch.
1. Add mutex when operating the memory pool. 2. Support CMake 3.19 and fix some CMake bugs. Note that CUDA_ROOT variable in CMake is modified as CUDA_TOOLKIT_ROOT. You can find this update in the README. 3. Fix some bugs on Ubuntu.
8f368e73 · liyinqiao · 7e102f8a · 8f368e73 · 8f368e73 · 8f368e73
Commit 8f368e73 authored Mar 13, 2021 by liyinqiao
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,19 +25,19 @@ option(USE_MKL "Use MKL" OFF)
 option(USE_OPENBLAS "Use OpenBLAS" OFF)
 option(GEN_DLL "Generate Dynamic Link Library" OFF)

-# If set USE_CUDA ON, please modify CUDA_ROOT below.
+# If set USE_CUDA ON, please modify CUDA_TOOLKIT_ROOT below.
 # If set USE_MKL ON, please modify the INTEL_ROOT below.
 # If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
 if (USE_CUDA)
-    if(NOT EXISTS ${CUDA_ROOT})
+    if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
        if(WIN32)
-            set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
+            set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
        else()
-            set(CUDA_ROOT "/usr/local/cuda-9.2")
+            set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
        endif()
    endif()
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
-    message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
+    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
+    message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
 endif()
 if(USE_MKL)
    if(NOT DEFINED INTEL_ROOT)
@@ -128,12 +128,13 @@ if(USE_CUDA)
    
    if(WIN32)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
+        string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
        set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
-        link_directories("${CUDA_ROOT}/lib/x64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
@@ -146,9 +147,9 @@ if(USE_CUDA)
        set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
        set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
-        link_directories("${CUDA_ROOT}/lib64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
        if(CUDA_VERSION_MAJOR EQUAL 11)
            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
@@ -158,7 +159,13 @@ if(USE_CUDA)
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
-        set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
+        if(EXISTS "/usr/lib64/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
+        elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
+        elseif(EXISTS "/lib64/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
+        endif()
    endif()
 endif()

@@ -293,4 +300,4 @@ else()
        target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
    endif()
    message(STATUS "${MESS}")
-endif()
\ No newline at end of file
+endif()
--- a/README.md
+++ b/README.md
@@ -39,14 +39,14 @@ NiuTensor蟾･蜈ｷ蛹庄莉･蝨ｨWindows縲´inux莉･蜿確acOS邇ｯ蠅ｸ玖ｿ幄｡悟ｮ芽｣ｼ梧髪

 ##### CMake方式（Visual Studio）

-对于WIndows平台的NiuTensor工具包安装，这里可以使用CMake工具自动生成Visual Studio项目（需要用户提前安装CMake工具以及Visual Studio集成开发环境），操作步骤如下：
+对于Windows平台的NiuTensor工具包安装，这里可以使用CMake工具自动生成Visual Studio项目（需要用户提前安装CMake工具以及Visual Studio集成开发环境），操作步骤如下：

 - 在工具包根目录新建目录以保存生成的Visual Studio项目文件（如建立build目录）。
 - 在项目根目录打开Windows平台的命令行工具（如PowerShell），执行`cd build`命令进入新建的build目录。
 - 执行CMake命令对Visual Studio项目进行生成（如果 visual studio 版本低于 2019，则在使用下列命令的时候需额外加上`-A x64`的CMake参数），如计划生成动态链接库，则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_MKL=ON`参数，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
 - 执行成功将显示`Build files have been written to:...`。
 - 打开build目录中的NiuTensor.sln文件即可通过Visual Studio打开NiuTensor项目。
 - 打开后在解决方案管理器中选中NiuTensor，右键将其设为启动项目即可开始使用。
@@ -67,7 +67,7 @@ NiuTensor蟾･蜈ｷ蛹庄莉･蝨ｨWindows縲´inux莉･蜿確acOS邇ｯ蠅ｸ玖ｿ幄｡悟ｮ芽｣ｼ梧髪
 - 打开CLion首选项，点击“构建，执行，部署”选项卡中的CMake，在“CMake选项”中进行设置，设置完成后CLion将自动使用CMake对项目进行构建，如计划生成动态链接库，则仅需在在“CMake选项”中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_MKL=ON`，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_OPENBLAS=ON`，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P `。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`，并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`-DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P `。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。

 ##### CMake方式（命令行）

@@ -78,7 +78,7 @@ NiuTensor蟾･蜈ｷ蛹庄莉･蝨ｨWindows縲´inux莉･蜿確acOS邇ｯ蠅ｸ玖ｿ幄｡悟ｮ芽｣ｼ梧髪
 - 执行CMake命令对项目进行生成，如计划生成动态链接库，则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可，否则默认生成可执行程序。
  - 如项目计划启用MKL数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_MKL=ON`参数，并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库（Intel工具包）的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`。
  - 如项目计划启用OpenBLAS数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数，并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`。
-  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
+  - 如项目计划启用CUDA数学运算库（需用户自行安装），则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数，并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径，通过-DGPU_ARCH=ARCH指定所在GPU设备的架构（K：Kepler架构；M：Maxwell架构；P：Pascal架构；V：Volta架构；T：Turing架构；A：Ampere架构）。如`cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算，需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数（需要注意的是半精度但需要注意的是，半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持，该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询）。
 - 执行成功将显示`Build files have been written to:...`并在该目录下生成Makefile文件。
 - 执行`make -j`命令对NiuTensor项目进行编译，执行成功将显示`Built target NiuTensor`，安装完毕。

@@ -137,4 +137,4 @@ NiuTensor蠑驥剰ｮ｡邂怜ｺ鍋罰荳懷圏螟ｧ蟄ｦ閾ｪ辟ｶ隸ｭ險螟炊螳樣ｪ悟ｮ､蟆冗央蠑貅仙

 ## 更新版本

-NiuTensor version 0.3.5 - 2021年2月6日
+NiuTensor version 0.4.0 - 2021年3月13日
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -159,11 +159,15 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Sign(a, tmp);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -188,12 +192,16 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Sin(a, tmp);
        _NegateMe(tmp);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -218,11 +226,15 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Exp(a, tmp);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -317,11 +329,15 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Cos(a, tmp);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -341,17 +357,22 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");

    XTensor * a = income.tails[0];
-    XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);

    /* dE/da = dE/dc * 1/(cos(a))^2
             = dE/dc * (cos(a))^-2 */
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);
+
+        if (a->mem != NULL)
+            a->mem->LockBuf();
+        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Cos(a, tmp);
        _PowerMe(tmp, -2.0F);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -379,11 +400,15 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _ClipBackward(node, a, node->grad, tmp, lower, upper);
        _SumMe(a->grad, tmp);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -417,6 +442,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
             = dE/dc * a * (-b^-2) */
    if (!isEfficient || b->isGrad) {
        XNoder::MakeGrad(b);
+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Power(b, tmp, -2.0F);
        _NegateMe(tmp);
@@ -424,6 +451,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
        _Multiply(node->grad, tmp, b->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -461,9 +490,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
        int dimSize[MAX_TENSOR_DIM_NUM];
        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
        XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
+        if ((b->mem != NULL) && (b->mem != a->mem)) {
+            b->mem->LockBuf();
+        }
        XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
+        if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
+            node->mem->LockBuf();
+        }
        XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);

        _Negate(a, aTMP1);
@@ -505,6 +542,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
               Then reduce along with z and x to obtain dE/db. */
            interGradTMP->Reshape(3, reshapedSize);

+            // b->mem->LockBuf();
            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
            _ReduceSum(interGradTMP, interGrad, 2);

@@ -515,12 +553,21 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)

            DelTensorBuf(bGradTMP2);
            DelTensorBuf(interGrad);
+            // b->mem->UnlockBuf();
        }

        DelTensorBuf(interGradTMP);
+        if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
+            node->mem->UnlockBuf();
+        }
        DelTensorBuf(bTMP);
+        if ((b->mem != NULL) && (b->mem != a->mem)) {
+            b->mem->UnlockBuf();
+        }
        DelTensorBuf(aTMP2);
        DelTensorBuf(aTMP1);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -805,6 +852,8 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
        int dimSize[MAX_TENSOR_DIM_NUM];
        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);

+        if (node->mem != NULL)
+            node->mem->LockBuf();
        XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
        _Multiply(node->grad, a, bGradTMP);

@@ -817,12 +866,18 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
               size of b. Then we can reduce the matrix into a row vector. */
            bGradTMP->Reshape(2, reshapedSize);

+            if ((b->mem != NULL) && (b->mem != node->mem)) {
+                b->mem->LockBuf();
+            }
            XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(bGradTMP, bGradTMP2, 0);

            _Sum(b->grad, bGradTMP2, b->grad);

            DelTensorBuf(bGradTMP2);
+            if ((b->mem != NULL) && (b->mem != node->mem)) {
+                b->mem->UnlockBuf();
+            }
        }
        else {
            int reshapedSize[MAX_TENSOR_DIM_NUM];
@@ -841,6 +896,9 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
               Then reduce along with z and x to obtain dE/db. */
            bGradTMP->Reshape(3, reshapedSize);

+            if ((b->mem != NULL) && (b->mem != node->mem)) {
+                b->mem->LockBuf();
+            }
            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
            _ReduceSum(bGradTMP, interGrad, 2);

@@ -851,8 +909,13 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)

            DelTensorBuf(bGradTMP2);
            DelTensorBuf(interGrad);
+            if ((b->mem != NULL) && (b->mem != node->mem)) {
+                b->mem->UnlockBuf();
+            }
        }
        DelTensorBuf(bGradTMP);
+        if (node->mem != NULL)
+            node->mem->UnlockBuf();
    }
 }

@@ -949,12 +1012,16 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Power(a, tmp, p - 1.0F);
        _ScaleAndShiftMe(tmp, p);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -979,12 +1046,16 @@ void XMathGrad::GradReciprocal(XTensor* node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Power(a, tmp, -2.0F);
        _NegateMe(tmp);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1008,11 +1079,15 @@ void XMathGrad::GradSqrt(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _ScaleMe(tmp, 2.0F);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1036,12 +1111,16 @@ void XMathGrad::GradSquare(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Power(a, tmp, -0.5F);
        _ScaleMe(tmp, 0.5);
        _Multiply(node->grad, tmp, a->grad, 1.0F);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1226,12 +1305,16 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
               size of b. Then we can reduce the matrix into a row vector. */
            node->grad->Reshape(2, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(node->grad, bGradTMP, 0);
            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sub(b->grad, bGradTMP, b->grad);
            DelTensorBuf(bGradTMP);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();

            node->grad->Reshape(order, dimSize);
        }
@@ -1252,6 +1335,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
               Then reduce along with z and x to obtain dE/db. */
            node->grad->Reshape(3, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);

            _ReduceSum(node->grad, interGrad, 2);
@@ -1266,6 +1351,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
            node->grad->Reshape(order, dimSize);

            DelTensorBuf(interGrad);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();
        }
    }
 }
@@ -1346,12 +1433,16 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
               size of b. Then we can reduce the matrix into a row vector. */
            node->grad->Reshape(2, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(node->grad, bGradTMP, 0);
            if (beta != 1.0F)
                _ScaleAndShiftMe(bGradTMP, beta);
            _Sum(bGradTMP, b->grad, b->grad);
            DelTensorBuf(bGradTMP);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();

            node->grad->Reshape(order, dimSize);
        }
@@ -1372,6 +1463,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
               Then reduce along with z and x to obtain dE/db. */
            node->grad->Reshape(3, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);

            _ReduceSum(node->grad, interGrad, 2);
@@ -1386,6 +1479,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
            node->grad->Reshape(order, dimSize);

            DelTensorBuf(interGrad);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();
        }
    }
 }
@@ -1452,12 +1547,16 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Unsqueeze(node->grad, tmp, dim, n);
        _ScaleAndShiftMe(tmp, 1.0F / n);
        _Sum(a->grad, tmp, a->grad);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1486,10 +1585,14 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        _Unsqueeze(node->grad, tmp, dim, n);
        _Sum(a->grad, tmp, a->grad);
        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1515,11 +1618,15 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
        DTYPE value = node->grad->Get0D();
        tmp->SetDataFixed(value);
        _Sum(a->grad, tmp, a->grad);
        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }
 }

@@ -1542,9 +1649,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
+    if (a->mem != NULL)
+        a->mem->LockBuf();
    XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
    XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
    XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
+    if ((b->mem != NULL) && (b->mem != a->mem)) {
+        b->mem->LockBuf();
+    }
    XTensor * f = NewTensorBufV2(b, b->devID, b->mem);

    int dim = income.GetParamInt(0);
@@ -1573,9 +1685,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
    }

    DelTensorBuf(f);
+    if ((b->mem != NULL) && (b->mem != a->mem)) {
+        b->mem->UnlockBuf();
+    }
    DelTensorBuf(e);
    DelTensorBuf(d);
    DelTensorBuf(c);
+    if (a->mem != NULL)
+        a->mem->UnlockBuf();
 }

 /*
@@ -1598,9 +1715,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];
+    if (a->mem != NULL)
+        a->mem->LockBuf();
    XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
    XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
    XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
+    if ((b->mem != NULL) && (b->mem != a->mem)) {
+        b->mem->LockBuf();
+    }
    XTensor * f = NewTensorBufV2(b, b->devID, b->mem);

    int dim = income.GetParamInt(0);
@@ -1628,9 +1750,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
    }

    DelTensorBuf(f);
+    if ((b->mem != NULL) && (b->mem != a->mem)) {
+        b->mem->UnlockBuf();
+    }
    DelTensorBuf(e);
    DelTensorBuf(d);
    DelTensorBuf(c);
+    if (a->mem != NULL)
+        a->mem->UnlockBuf();
 }

 /*
@@ -1675,10 +1802,14 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
            size of b. Then we can reduce the matrix into a row vector. */
            node->grad->Reshape(2, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
            _ReduceSum(node->grad, bGradTMP, 0);
            _Sum(bGradTMP, b->grad, b->grad);
            DelTensorBuf(bGradTMP);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();

            node->grad->Reshape(order, dimSize);
        }
@@ -1699,6 +1830,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
            Then reduce along with z and x to obtain dE/db. */
            node->grad->Reshape(3, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
            _ReduceSum(node->grad, interGrad, 2);

@@ -1710,6 +1843,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
            node->grad->Reshape(order, dimSize);

            DelTensorBuf(interGrad);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();
        }
    }

@@ -1814,6 +1949,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
            Then reduce along with z and x to obtain dE/db. */
            node->grad->Reshape(3, reshapedSize);

+            if (b->mem != NULL)
+                b->mem->LockBuf();
            XTensor* interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
            _ReduceSum(node->grad, interGrad, 2);

@@ -1825,6 +1962,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
            node->grad->Reshape(order, dimSize);

            DelTensorBuf(interGrad);
+            if (b->mem != NULL)
+                b->mem->UnlockBuf();
        }
    }


--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -105,11 +105,15 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
    if (!isEfficient || a->isGrad) {
        XNoder::MakeGrad(a);

+        if (a->mem != NULL)
+            a->mem->LockBuf();
        XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
        _ConvertDataType(node->grad, tmp);
        _SumMe(a->grad, tmp);

        DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;
@@ -141,11 +145,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
    if (!isEfficient || input->isGrad) {
        XNoder::MakeGrad(input);

+        if (input->mem != NULL)
+            input->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
        _SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
        _SumMe(input->grad, tmp);

        DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;
@@ -173,12 +181,16 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
    if (!isEfficient || input->isGrad) {
        XNoder::MakeGrad(input);

+        if (input->mem != NULL)
+            input->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
        tmp->SetZeroAll();
        _SpreadForGather(tmp, node->grad, index);
        _SumMe(input->grad, tmp);

        DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;
@@ -200,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
    if (!isEfficient || input->isGrad) {
        XNoder::MakeGrad(input);

+        if (input->mem != NULL)
+            input->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
        _CopyValues(node->grad, tmp);

@@ -212,6 +226,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
        _SumMe(input->grad, tmp);

        DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;
@@ -456,12 +472,16 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
        /* if the tensor is used somewhere else, we need another SUM
           for gradient accumulation */
        else {
+            if (input->mem != NULL)
+                input->mem->LockBuf();
            XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);

            _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
            _Sum(input->grad, inputGradTMP, input->grad);

            DelTensorBuf(inputGradTMP);
+            if (input->mem != NULL)
+                input->mem->UnlockBuf();
        }
    }

@@ -543,12 +563,16 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
           somewhere else, we need another SUM for gradient
           accumulation */
        else {
+            if (node->mem != NULL)
+                node->mem->LockBuf();
            XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);

            _Merge(&splits, nodeGradTMP, whereToSplit + 1);
            _Sum(node->grad, nodeGradTMP, node->grad);

            DelTensorBuf(nodeGradTMP);
+            if (node->mem != NULL)
+                node->mem->UnlockBuf();
        }
    }

@@ -584,11 +608,15 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
        CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
        CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");

+        if (input->mem != NULL)
+            input->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
        _Transpose(output->grad, tmp, i, j);
        _Sum(input->grad, tmp, input->grad);

        DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;
@@ -622,12 +650,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
    if (!isEfficient || input->isGrad) {
        XNoder::MakeGrad(input);

+        if (input->mem != NULL)
+            input->mem->LockBuf();
        XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);

        _ReduceSum(output->grad, tmp, dim);
        _Sum(input->grad, tmp, input->grad);

        DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
    }

    node->visitMark = NODE_FINISHED;

--- a/source/sample/transformer/Model.cpp
+++ b/source/sample/transformer/Model.cpp
@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);

+    GMems.GetMem(paddingEnc.devID)->LockBuf();
    XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
        paddingEnc.dataType, paddingEnc.devID);
    XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,

    DelTensorBuf(maskEncDecTMPDec);
    DelTensorBuf(maskEncDecTMPEnc);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();

    /* padding on the source side */
    int* dimsPadding = new int[paddingEnc.order + 2];
@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);

+    GMems.GetMem(paddingEnc.devID)->LockBuf();
    XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
        paddingEnc.devID);

@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,

    DelTensorBuf(padding3);
    DelTensorBuf(padding2);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
 }

 /*

--- a/source/sample/transformer/train/Trainer.cpp
+++ b/source/sample/transformer/train/Trainer.cpp
@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
            _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);

            /* v2 = m / (sqrt(v) + delta) */
+            GMems.GetMem(v->devID)->LockBuf();
            XTensor* v2 = NewTensorBuf(v, v->devID);
            _Power(v, v2, 0.5F);
            _ScaleAndShiftMe(v2, 1.0F, d);
@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
            _Sum(para, v2, para, -e);

            DelTensorBuf(v2);
+            GMems.GetMem(v->devID)->UnlockBuf();
        }
        else {
            /* the delta rule */

--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea

    if (b.order == 0){
        DTYPE scale = 1.0F / b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
        XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
        XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);

        ScaleAndShift(a, *tmp1, scale, 0.0F);
        ScaleAndShift(c, *tmp2, alpha, 0.0F);
        Sum(*tmp2, *tmp1, c);

-        DelTensorBuf(tmp1);
        DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
    }
    else {
        int n = GetBroadcastDimIndex(a, b);

--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,

    float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);

+    if (x.mem != NULL)
+        x.mem->LockBuf();
    XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);

    /* call _MatrixMul function */
@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
    /* destroy variables */
    delete[] dimSize;
    DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();

    return c;
 }
@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,

    float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);

+    if (x.mem != NULL)
+        x.mem->LockBuf();
    XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);

    /* call _MatrixMul function */
@@ -175,6 +181,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
    /* destroy variables */
    delete[] dimSize;
    DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();

    return c;
 }

--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l

    if (b.order == 0){
        DTYPE scale = b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
        XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
        XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);

        ScaleAndShift(a, *tmp1, scale, 0.0F);
        ScaleAndShift(c, *tmp2, alpha, 0.0F);
        Sum(*tmp2, *tmp1, c);

-        DelTensorBuf(tmp1);
        DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
    }
    else {
        int n = GetBroadcastDimIndex(a, b);

--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
                source = target;
            }
            
-            target = t->mem != NULL ?
+            /*target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                     XMemAlloc(t->devID, t->unitNum * t->unitSize);
+                     XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+            if (t->mem != NULL) {
+                t->mem->LockBuf();
+                target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+            }
+            else {
+                target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+            }
            
            s->data = source;
            t->data = target;
@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
            /* free the memory space of the one before the last allocation */
            if(count > 0){
                int size = s->unitNum * s->unitSize;
-                if(t->mem != NULL)
+                if(t->mem != NULL) {
                    t->mem->ReleaseBuf(t->devID, size);
+                }
                else
                    XMemFree(t->devID, source);
            }
@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
            if(isLast){
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Multiply(a, t, c, beta);
-                if(t->mem != NULL)
+                if(t->mem != NULL) {
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                    t->mem->UnlockBuf();
+                }
                else
                    XMemFree(t->devID, target);
                target = NULL;

--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
                source = target;
            }
            
-            target = t->mem != NULL ?
+            /*target = t->mem != NULL ?
                     t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                     XMemAlloc(t->devID, t->unitNum * t->unitSize);
-            
+                     XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+            if (t->mem != NULL) {
+                t->mem->LockBuf();
+                target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+            }
+            else {
+                target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+            }
            s->data = source;
            t->data = target;
            
@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
            if(isLast){
                CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
                _Sum(a, t, c, beta);
-                if(t->mem != NULL)
+                if(t->mem != NULL) {
                    t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                    t->mem->UnlockBuf();
+                }
                else
                    XMemFree(t->devID, target);
                target = NULL;

--- a/source/tensor/core/arithmetic/XTensorBLAS.cu
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cu
@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
            DTYPE ** cpGPU = NULL;

            if (mem != NULL) {
+                mem->LockBuf();
                mem->SetPinBuf();
                apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
                bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
            delete[] bp;
            delete[] cp;

-            if(mem != NULL)
+            if (mem != NULL) {
                mem->BackToPinBuf();
+                mem->UnlockBuf();
+            }
            else {
                XMemFree(a0->devID, apGPU);
                XMemFree(a0->devID, bpGPU);

--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
 #ifdef USE_CUDA
        XMem * mem = tensor->mem;
        MTYPE size = num * sizeof(MTYPE);
-        MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+        //MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+        MTYPE * offsetsCuda;
+        if (mem != NULL) {
+            mem->LockBuf();
+            offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
+        }
+        else {
+            offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
+        }
        XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));

        _CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
        
-        if (mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(tensor->devID, offsetsCuda);
 #else

--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

-    MTYPE * offsetsCuda = mem != NULL ? 
+    /*MTYPE * offsetsCuda = mem != NULL ? 
                            (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) : 
                            (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
-    void * valuesCuda  = mem != NULL ? 
-                            mem->AllocBuf(mem->devID, valueSize) : 
-                            XMemAlloc(tensor->devID, valueSize);
+    void * valuesCuda = mem != NULL ?
+                        mem->AllocBuf(mem->devID, valueSize) :
+                        XMemAlloc(tensor->devID, valueSize);*/
+    MTYPE * offsetsCuda;
+    void * valuesCuda; 
+    if (mem != NULL) {
+        mem->LockBuf();
+        offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
+        valuesCuda = mem->AllocBuf(mem->devID, valueSize);
+    }
+    else {
+        offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+        valuesCuda = XMemAlloc(tensor->devID, valueSize);
+    }

    if (mem != NULL) {
        XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
    if (mem != NULL) {
        mem->ReleaseBuf(mem->devID, valueSize);
        mem->ReleaseBuf(mem->devID, offsetSize);
+        mem->UnlockBuf();
    }
    else {
        XMemFree(tensor->devID, valuesCuda);

--- a/source/tensor/core/movement/CopyBlocks.cpp
+++ b/source/tensor/core/movement/CopyBlocks.cpp
@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void 
    if (devID >= 0) {
 #ifdef USE_CUDA
        /* copy the index from host to device */
-        int * targetBlocksTMP = myMem != NULL ?
+        /*int * targetBlocksTMP = myMem != NULL ?
                               (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
-                               (int*)XMemAlloc(devID, blockNum * sizeof(int));
+                               (int*)XMemAlloc(devID, blockNum * sizeof(int));*/
+        int * targetBlocksTMP;
+        if (myMem != NULL) {
+            myMem->LockBuf();
+            targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
+        }
+        else {
+            targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
+        }
        XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));

        _CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);

-        if(myMem != NULL)
+        if (myMem != NULL) {
            myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+            myMem->UnlockBuf();
+        }
        else
            XMemFree(devID, targetBlocksTMP);
 #else

--- a/source/tensor/core/movement/CopyBlocksInGrid.cpp
+++ b/source/tensor/core/movement/CopyBlocksInGrid.cpp
@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, 
 #ifdef USE_CUDA
        int * indexGPU = index;
        if (!isIndexOnDev) {
+            myMem->LockBuf();
            indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
            XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
        }

        _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);

-        if (!isIndexOnDev)
+        if (!isIndexOnDev) {
            myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
+            myMem->UnlockBuf();
+        }
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif

--- a/source/tensor/core/movement/CopyBlocksSelected.cu
+++ b/source/tensor/core/movement/CopyBlocksSelected.cu
@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
    ProtectCudaDev(devID, devIDBackup);

    /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = myMem != NULL ? 
+    /*int * sourceBlocksTMP = myMem != NULL ? 
                           (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : 
                           (int *)XMemAlloc(devID, blockNum * sizeof(int));
    int * targetBlocksTMP = myMem != NULL ? 
                           (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : 
-                           (int *)XMemAlloc(devID, blockNum * sizeof(int));
+                           (int *)XMemAlloc(devID, blockNum * sizeof(int));*/
+    int * sourceBlocksTMP;
+    int * targetBlocksTMP;
+    if (myMem != NULL) {
+        myMem->LockBuf();
+        sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    }
    
    XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
    XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
    if (myMem != NULL) {
        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        myMem->UnlockBuf();
    }
    else {
        XMemFree(devID, sourceBlocksTMP);

--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
            CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
        }

-        sIndex = mem != NULL ? 
+        /*sIndex = mem != NULL ? 
                  (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : 
-                  (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+                  (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
        XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
    }
    else {
@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
    }

    if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(mem->devID, sIndex);
    }
@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
            CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
        }

-        sIndex = mem != NULL ?
-                  (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                  (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+        (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+        (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
        XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
    }
    else {
@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
    else {
        ShowNTErrors("Unsupported dataType!");
    }
+
+    if (srcIndex->devID < 0) {
+        if (mem != NULL) {
+            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
+        else
+            XMemFree(mem->devID, sIndex);
+    }
 }
 #endif // USE_CUDA


--- a/source/tensor/core/movement/Spread.cu
+++ b/source/tensor/core/movement/Spread.cu
@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
        DTYPE * c = (DTYPE*)collection->data;

        XMem * mem = source->mem;
-        int * si = mem != NULL ? 
+        /*int * si = mem != NULL ? 
                   (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) : 
-                   (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+                   (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
+        int * si;
+        if (mem != NULL) {
+            mem->LockBuf();
+            si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
+        }
+        else {
+            si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+        }
        int * ci = si + indexSize;

        XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
        KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
                                                stride, indexSize, si, ci);

-        if(mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(mem->devID, si);
    }
@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);

    if (srcIndex->devID < 0) {
-        sIndex = mem != NULL ? 
+        /*sIndex = mem != NULL ? 
                (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : 
-                (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+                (int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        }
        XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
    }
    else
@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
    }

    if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(devID, sIndex);
    }

--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,           
 KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
 KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)

-/* 
-get the max-valued items along a dimension of the tensor (cuda version). 
+/*
+get the max-valued items along a dimension of the tensor (cuda version).
 For a 1-dimensional data array a,
 sum_i = max_{0<=j<strideNum} input_{i,j}
 >> input - the input tensor
@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)                
        XMem * mem = input->mem;                                                                                                              \
        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);                                     \
        int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;                                                              \
-        DTYPE * buf  = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);                          \
+        DTYPE * buf;                                                                                                                          \
+        if (mem != NULL) {                                                                                                                    \
+            mem->LockBuf();                                                                                                                   \
+            buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);                                                                                 \
+        }                                                                                                                                     \
+        else {                                                                                                                                \
+            buf = (DTYPE*)XMemAlloc(devID, bufSize);                                                                                          \
+        }                                                                                                                                     \
        DTYPE * buf1 = buf;                                                                                                                   \
        DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;                                                                             \
        do {                                                                                                                                  \
@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)                
                                                                                                                                              \
        } while (strideNum > 1);                                                                                                              \
                                                                                                                                              \
-        if (mem != NULL)                                                                                                                      \
+        if (mem != NULL) {                                                                                                                    \
            mem->ReleaseBuf(mem->devID, bufSize);                                                                                             \
+            mem->UnlockBuf();                                                                                                                 \
+        }                                                                                                                                     \
        else                                                                                                                                  \
            XMemFree(input->devID, buf);                                                                                                      \
    }                                                                                                                                         \

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);

        int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
-        DTYPE * buf  = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+        //DTYPE * buf  = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+        DTYPE * buf;
+        if (mem != NULL) {
+            mem->LockBuf();
+            buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
+        }
+        else {
+            buf = (DTYPE*)XMemAlloc(devID, bufSize);
+        }
        DTYPE * buf1 = buf;
        DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
        do {
@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
        } while (strideNum > 1);
        

-        if (mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, bufSize);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(devID, buf);
    }

--- a/source/tensor/core/reduce/ReduceSumAll.cpp
+++ b/source/tensor/core/reduce/ReduceSumAll.cpp
@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)

    int dims[1] = {source->unitNum};

+    if (source->mem != NULL)
+        source->mem->LockBuf();
    XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);

    _CopyValues(source, all);
    _ReduceSum(all, target, 0);

    DelTensorBuf(all);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }

 /*
@@ -72,6 +76,8 @@ sum all the items of the tensor (It should be optimized!)
 void _ReduceSumAll(const XTensor * source, DTYPE * value)
 {
    int * dimSize = new int[MAX_TENSOR_DIM_NUM];
+    if (source->mem != NULL)
+        source->mem->LockBuf();
    XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
    target->SetTMPFlag();

@@ -81,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)

    delete[] dimSize;
    DelTensorBuf(target);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }

 /*

--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)

        void * dataTMP = t->data;

-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+        if (!isOnSameDevice) {
+            /*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(mem->devID, size);
+            }
+        }

        int blockNumInMerge = s->dimSize[leadingDim];
        int splitSizeInGrid = gridSize / blockNumInMerge;
        int realBlockSize = blockSize * t->unitSize;

-        int * blockIndex = (int*)(mem != NULL ?
+        /*int * blockIndex = (int*)(mem != NULL ?
                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
+                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
+        }

        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);

        _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);

-        if (mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
        else
            XMemFree(s->devID, blockIndex);

        if (!isOnSameDevice) {
            XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-            if (mem != NULL)
+            if (mem != NULL) {
                mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
            else
                XMemFree(s->devID, dataTMP);
        }
@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
        void * dataTMP = NULL;
        if (uniform)
            dataTMP = smallsItem0->data;
-        else
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+        else {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(t->devID, size);
+            }
+        }

        tensorTMP->data = dataTMP;

@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
        tensorTMP->data = NULL;
        delete tensorTMP;

-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
            mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(t->devID, dataTMP);
    }

--- a/source/tensor/core/shape/MergeBlockLists.cu
+++ b/source/tensor/core/shape/MergeBlockLists.cu
@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block

    GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
                          cudaGridSizes, cudaBlockSizes);
-
+    myMem->LockBuf();
    myMem->SetPinBuf();
    int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);

@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
                            (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);

    myMem->BackToPinBuf();
+    myMem->UnlockBuf();

    delete[] sourceArrays;
    delete[] targetArrays;

--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)

        void * dataTMP = t->data;

-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+        if (!isOnSameDevice) {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(s->devID, size);
+            }
+        }

        int realBlockSize = blockSize * t->unitSize;
        int blockSplitSize = blockNum / splitNum;

-        int * blockIndex = (int*)(mem != NULL ?
+        /*int * blockIndex = (int*)(mem != NULL ?
                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * sizeof(int)));
+                                  XMemAlloc(s->devID, blockNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
+        }

        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);

        _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);

-        if (mem != NULL)
+        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
        else
            XMemFree(s->devID, blockIndex);

@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        if (!isOnSameDevice) {
            XMemCopy(t->data, t->devID, dataTMP, s->devID, size);

-            if (mem != NULL)
+            if (mem != NULL) {
                mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
            else
                XMemFree(s->devID, dataTMP);
        }
@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
            dataTMP = first->data;
        }
        else {
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(big->devID, size);
+            }
        }

        tensorTMP->data = dataTMP;
@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
        tensorTMP->data = NULL;
        delete tensorTMP;

-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
            mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
        else
            XMemFree(big->devID, dataTMP);
    }

--- a/source/tensor/core/sort/Sort.cu
+++ b/source/tensor/core/sort/Sort.cu
@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
    int m = GetNextPower2(strideNum);
    int n = stride * blockNum;

-    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    //void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    void * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
+    }
+    else {
+        buf = XMemAlloc(a->devID, n * m * a->unitSize);
+    }
    void * bufIndex = NULL;
    if (indexA != NULL && indexB != NULL) {
        bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
        KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                      (bufIndex, indexB->data, m, n, stride, k, blockNum);

-    if (mem != NULL)
+    if (mem != NULL) {
        mem->ReleaseBuf(a->devID, n * m * a->unitSize);
+        mem->UnlockBuf();
+    }
    else
        XMemFree(a->devID, buf);
    if (indexA != NULL && indexB != NULL)

--- a/source/tensor/function/LogSoftmax.cpp
+++ b/source/tensor/function/LogSoftmax.cpp
@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
        blockSize = stride * dimensionSize;
        blockNum = y->unitNum / blockSize;

+        if (mem != NULL)
+            mem->LockBuf();
        max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
        sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);

@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)

        DelTensorBuf(max);
        DelTensorBuf(sum);
+        if (mem != NULL)
+            mem->UnlockBuf();

        if (x->devID >= 0) {
            delete blockx;

--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
        XTensor * max = NULL;
        XTensor * sum = NULL;

+        if (mem != NULL)
+            mem->LockBuf();
        max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
        sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);

@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)

        DelTensorBuf(sum);
        DelTensorBuf(max);
+        if (mem != NULL)
+            mem->UnlockBuf();

        delete[] dimSize;
    }

--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
            dimSize[i - 1] = output->dimSize[i];
    }

+    if (output->mem != NULL)
+        output->mem->LockBuf();
    XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio, 
-                                     output->devID, output->mem);
+                                       output->devID, output->mem);

    _CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);

@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
            nonZeroNum = (DTYPE)lossBuf->unitNum;
        }
        else {
+            if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+                padding->mem->LockBuf();
+            }
            XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
            _IsNonZero(padding, tmp);
            _ReduceSumAll(tmp, &nonZeroNum);
            DelTensorBuf(tmp);
+            if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+                padding->mem->UnlockBuf();
+            }
        }

        loss = loss / nonZeroNum;
@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,

    delete[] dimSize;
    DelTensorBuf(lossBuf);
+    if (output->mem != NULL)
+        output->mem->UnlockBuf();

    return loss;
 }

--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
 {
    int n = leadingDim < 0 ? output->order - 1 : leadingDim;
    
+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
    XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
    XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
    
@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,

    DelTensorBuf(interBuf2);
    DelTensorBuf(interBuf1);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }
 }

 /*
@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
            dimSize[i - 1] = output->dimSize[i];
    }

+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
    XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio, 
                                     output->devID, output->mem);

@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
            nonZeroNum = (DTYPE)lossBuf->unitNum;
        }
        else {
+            if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+                padding->mem->LockBuf();
+            }
            XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
            _IsNonZero(padding, tmp);
            _ReduceSumAll(tmp, &nonZeroNum);
            DelTensorBuf(tmp);
+            if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+                padding->mem->UnlockBuf();
+            }
        }

        loss = loss / nonZeroNum;
@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,

    delete[] dimSize;
    DelTensorBuf(lossBuf);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }

    return loss;
 }

--- a/source/train/XWorkerBroadcast.h
+++ b/source/train/XWorkerBroadcast.h
@@ -86,4 +86,4 @@ public:

 }

-#endif
+#endif
\ No newline at end of file