Commit 8f368e73 by liyinqiao

Merge with liyinqiao branch.

1. Add mutex when operating the memory pool.
2. Support CMake 3.19 and fix some CMake bugs. Note that CUDA_ROOT variable in CMake is modified as CUDA_TOOLKIT_ROOT. You can find this update in the README.
3. Fix some bugs on Ubuntu.
parent 7e102f8a
......@@ -25,19 +25,19 @@ option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
# If set USE_CUDA ON, please modify CUDA_ROOT below.
# If set USE_CUDA ON, please modify CUDA_TOOLKIT_ROOT below.
# If set USE_MKL ON, please modify the INTEL_ROOT below.
# If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
if (USE_CUDA)
if(NOT EXISTS ${CUDA_ROOT})
if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
if(WIN32)
set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
else()
set(CUDA_ROOT "/usr/local/cuda-9.2")
set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
endif()
endif()
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
......@@ -128,12 +128,13 @@ if(USE_CUDA)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_ROOT}/lib/x64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
......@@ -146,9 +147,9 @@ if(USE_CUDA)
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
link_directories("${CUDA_ROOT}/lib64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
......@@ -158,7 +159,13 @@ if(USE_CUDA)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
if(EXISTS "/usr/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
elseif(EXISTS "/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
endif()
endif()
endif()
......@@ -293,4 +300,4 @@ else()
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
\ No newline at end of file
endif()
......@@ -39,14 +39,14 @@ NiuTensor蟾・蜈キ蛹庄莉・蝨ィWindows縲´inux莉・蜿確acOS邇ッ蠅ク玖ソ幄。悟ョ芽」シ梧髪
##### CMake方式(Visual Studio)
对于WIndows平台的NiuTensor工具包安装,这里可以使用CMake工具自动生成Visual Studio项目(需要用户提前安装CMake工具以及Visual Studio集成开发环境),操作步骤如下:
对于Windows平台的NiuTensor工具包安装,这里可以使用CMake工具自动生成Visual Studio项目(需要用户提前安装CMake工具以及Visual Studio集成开发环境),操作步骤如下:
- 在工具包根目录新建目录以保存生成的Visual Studio项目文件(如建立build目录)。
- 在项目根目录打开Windows平台的命令行工具(如PowerShell),执行`cd build`命令进入新建的build目录。
- 执行CMake命令对Visual Studio项目进行生成(如果 visual studio 版本低于 2019,则在使用下列命令的时候需额外加上`-A x64`的CMake参数),如计划生成动态链接库,则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_MKL=ON`参数,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='C:/Program Files (x86)/IntelSWTools/compilers_and_libraries_2020.2.254/windows' ..`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='C:/Program Files/OpenBLAS' ..`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 执行成功将显示`Build files have been written to:...`
- 打开build目录中的NiuTensor.sln文件即可通过Visual Studio打开NiuTensor项目。
- 打开后在解决方案管理器中选中NiuTensor,右键将其设为启动项目即可开始使用。
......@@ -67,7 +67,7 @@ NiuTensor蟾・蜈キ蛹庄莉・蝨ィWindows縲´inux莉・蜿確acOS邇ッ蠅ク玖ソ幄。悟ョ芽」シ梧髪
- 打开CLion首选项,点击“构建,执行,部署”选项卡中的CMake,在“CMake选项”中进行设置,设置完成后CLion将自动使用CMake对项目进行构建,如计划生成动态链接库,则仅需在在“CMake选项”中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_MKL=ON`,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`-DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux'`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_OPENBLAS=ON`,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`-DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS'`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`-DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P `。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在“CMake选项”中填入`-DUSE_CUDA=ON`,并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`-DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P `。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
##### CMake方式(命令行)
......@@ -78,7 +78,7 @@ NiuTensor蟾・蜈キ蛹庄莉・蝨ィWindows縲´inux莉・蜿確acOS邇ッ蠅ク玖ソ幄。悟ョ芽」シ梧髪
- 执行CMake命令对项目进行生成,如计划生成动态链接库,则仅需在命令中额外加上`-DGEN_DLL=ON`的CMake参数即可,否则默认生成可执行程序。
- 如项目计划启用MKL数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_MKL=ON`参数,并通过`-DINTEL_ROOT='/intel/root/path'`指定MKL库(Intel工具包)的安装路径。如`cmake -DUSE_MKL=ON -DINTEL_ROOT='/opt/intel/compilers_and_libraries_2020.2.254/linux' ..`
- 如项目计划启用OpenBLAS数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_OPENBLAS=ON`参数,并通过`-DOPENBLAS_ROOT='/openblas/root/path'`指定OpenBLAS库的安装路径。如`cmake -DUSE_OPENBLAS=ON -DOPENBLAS_ROOT='/opt/OpenBLAS' ..`
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 如项目计划启用CUDA数学运算库(需用户自行安装),则仅需在CMake命令中使用`-DUSE_CUDA=ON`参数,并通过`-DCUDA_TOOLKIT_ROOT='/cuda/root/path'`指定CUDA库的安装路径,通过-DGPU_ARCH=ARCH指定所在GPU设备的架构(K:Kepler架构;M:Maxwell架构;P:Pascal架构;V:Volta架构;T:Turing架构;A:Ampere架构)。如`cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT='/usr/local/cuda-9.2' -DGPU_ARCH=P ..`。如需在GPU设备上使用半精度浮点数进行运算,需在启用`-DUSE_CUDA=ON`参数的同时启用`-USE_HALF_PRECISION=ON`参数(需要注意的是半精度但需要注意的是,半精度操作仅在使用Pascal及更新架构的NVIDIA GPU中提供支持,该项可参考[NVIDIA GPU设备信息](https://developer.nvidia.com/cuda-gpus)进行查询)。
- 执行成功将显示`Build files have been written to:...`并在该目录下生成Makefile文件。
- 执行`make -j`命令对NiuTensor项目进行编译,执行成功将显示`Built target NiuTensor`,安装完毕。
......@@ -137,4 +137,4 @@ NiuTensor蠑驥剰ョ。邂怜コ鍋罰荳懷圏螟ァ蟄ヲ閾ェ辟カ隸ュ險螟炊螳樣ェ悟ョ、蟆冗央蠑貅仙
## 更新版本
NiuTensor version 0.3.5 - 2021年2月6日
NiuTensor version 0.4.0 - 2021年3月13日
......@@ -159,11 +159,15 @@ void XMathGrad::GradAbsolute(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sign(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -188,12 +192,16 @@ void XMathGrad::GradCos(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Sin(a, tmp);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -218,11 +226,15 @@ void XMathGrad::GradExp(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Exp(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -317,11 +329,15 @@ void XMathGrad::GradSin(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -341,17 +357,22 @@ void XMathGrad::GradTan(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
XTensor * a = income.tails[0];
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
/* dE/da = dE/dc * 1/(cos(a))^2
= dE/dc * (cos(a))^-2 */
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Cos(a, tmp);
_PowerMe(tmp, -2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -379,11 +400,15 @@ void XMathGrad::GradClip(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_ClipBackward(node, a, node->grad, tmp, lower, upper);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -417,6 +442,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
= dE/dc * a * (-b^-2) */
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(b, tmp, -2.0F);
_NegateMe(tmp);
......@@ -424,6 +451,8 @@ void XMathGrad::GradDiv(XTensor * node, bool isEfficient)
_Multiply(node->grad, tmp, b->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -461,9 +490,17 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * aTMP1 = NewTensorBufV2(a, a->devID, a->mem);
XTensor * aTMP2 = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * bTMP = NewTensorBufV2(b, b->devID, b->mem);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->LockBuf();
}
XTensor * interGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Negate(a, aTMP1);
......@@ -505,6 +542,7 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
interGradTMP->Reshape(3, reshapedSize);
// b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(interGradTMP, interGrad, 2);
......@@ -515,12 +553,21 @@ void XMathGrad::GradDivDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
// b->mem->UnlockBuf();
}
DelTensorBuf(interGradTMP);
if ((node->mem != NULL) && (node->mem != a->mem) && (node->mem != b->mem)) {
node->mem->UnlockBuf();
}
DelTensorBuf(bTMP);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(aTMP2);
DelTensorBuf(aTMP1);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -805,6 +852,8 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
......@@ -817,12 +866,18 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
bGradTMP->Reshape(2, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * bGradTMP2 = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(bGradTMP, bGradTMP2, 0);
_Sum(b->grad, bGradTMP2, b->grad);
DelTensorBuf(bGradTMP2);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
......@@ -841,6 +896,9 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
bGradTMP->Reshape(3, reshapedSize);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->LockBuf();
}
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(bGradTMP, interGrad, 2);
......@@ -851,8 +909,13 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
DelTensorBuf(bGradTMP2);
DelTensorBuf(interGrad);
if ((b->mem != NULL) && (b->mem != node->mem)) {
b->mem->UnlockBuf();
}
}
DelTensorBuf(bGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
......@@ -949,12 +1012,16 @@ void XMathGrad::GradPower(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, p - 1.0F);
_ScaleAndShiftMe(tmp, p);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -979,12 +1046,16 @@ void XMathGrad::GradReciprocal(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -2.0F);
_NegateMe(tmp);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1008,11 +1079,15 @@ void XMathGrad::GradSqrt(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ScaleMe(tmp, 2.0F);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1036,12 +1111,16 @@ void XMathGrad::GradSquare(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_Power(a, tmp, -0.5F);
_ScaleMe(tmp, 0.5);
_Multiply(node->grad, tmp, a->grad, 1.0F);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1226,12 +1305,16 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1252,6 +1335,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1266,6 +1351,8 @@ void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
}
......@@ -1346,12 +1433,16 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if (beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1372,6 +1463,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1386,6 +1479,8 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
}
......@@ -1452,12 +1547,16 @@ void XMathGrad::GradReduceMean(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_ScaleAndShiftMe(tmp, 1.0F / n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1486,10 +1585,14 @@ void XMathGrad::GradReduceSum(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
_Unsqueeze(node->grad, tmp, dim, n);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1515,11 +1618,15 @@ void XMathGrad::GradReduceSumAll(XTensor * node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(a, a->devID, a->mem);
DTYPE value = node->grad->Get0D();
tmp->SetDataFixed(value);
_Sum(a->grad, tmp, a->grad);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
}
......@@ -1542,9 +1649,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1573,9 +1685,14 @@ void XMathGrad::GradReduceSumSquared(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1598,9 +1715,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
if (a->mem != NULL)
a->mem->LockBuf();
XTensor * c = NewTensorBufV2(a, a->devID, a->mem);
XTensor * d = NewTensorBufV2(a, a->devID, a->mem);
XTensor * e = NewTensorBufV2(a, a->devID, a->mem);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->LockBuf();
}
XTensor * f = NewTensorBufV2(b, b->devID, b->mem);
int dim = income.GetParamInt(0);
......@@ -1628,9 +1750,14 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
}
DelTensorBuf(f);
if ((b->mem != NULL) && (b->mem != a->mem)) {
b->mem->UnlockBuf();
}
DelTensorBuf(e);
DelTensorBuf(d);
DelTensorBuf(c);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
/*
......@@ -1675,10 +1802,14 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * bGradTMP = NewTensorBufV2(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
if (b->mem != NULL)
b->mem->UnlockBuf();
node->grad->Reshape(order, dimSize);
}
......@@ -1699,6 +1830,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor * interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1710,6 +1843,8 @@ void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......@@ -1814,6 +1949,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
if (b->mem != NULL)
b->mem->LockBuf();
XTensor* interGrad = NewTensorBufV2(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
......@@ -1825,6 +1962,8 @@ void XMathGrad::GradMLP(XTensor* node, bool isEfficient)
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
if (b->mem != NULL)
b->mem->UnlockBuf();
}
}
......
......@@ -105,11 +105,15 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -141,11 +145,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -173,12 +181,16 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -200,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -212,6 +226,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -456,12 +472,16 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
......@@ -543,12 +563,16 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
......@@ -584,11 +608,15 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -622,12 +650,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......
......@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
......@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
......@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......
......@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,6 +181,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
......@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,6 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -81,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
......
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
......
......@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
......@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......
......@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(max);
DelTensorBuf(sum);
if (mem != NULL)
mem->UnlockBuf();
if (x->devID >= 0) {
delete blockx;
......
......@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * max = NULL;
XTensor * sum = NULL;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(sum);
DelTensorBuf(max);
if (mem != NULL)
mem->UnlockBuf();
delete[] dimSize;
}
......
......@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL)
output->mem->LockBuf();
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
......@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL)
output->mem->UnlockBuf();
return loss;
}
......
......@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
......@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
}
/*
......@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
......@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
return loss;
}
......
......@@ -86,4 +86,4 @@ public:
}
#endif
#endif
\ No newline at end of file
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论