Commit ae6e43fd by liyinqiao

Merge with xiaotong branch.

1. Add a mutex to guard memory-pool operations.
2. Support CMake 3.19 and fix some CMake bugs. Note that the CUDA_ROOT variable in CMake has been renamed to CUDA_TOOLKIT_ROOT; you can find this update in the README.
3. Fix some bugs on Ubuntu.
parent 7e102f8a
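
The locking discipline added throughout the diff below always has the same shape: lock the memory pool before drawing a temporary buffer from it, unlock after the buffer is released. A minimal, self-contained C++ sketch of that idea using a plain std::mutex (MemPool here is an illustrative stand-in for the library's XMem; only the names LockBuf/UnlockBuf/AllocBuf/ReleaseBuf mirror the diff):

#include <array>
#include <cstddef>
#include <mutex>
#include <thread>

/* Illustrative stand-in for XMem: a scratch buffer guarded by a mutex
   so AllocBuf/ReleaseBuf pairs from different threads cannot interleave. */
class MemPool {
public:
    void LockBuf()   { bufMutex.lock(); }
    void UnlockBuf() { bufMutex.unlock(); }

    /* bump allocation; only valid while the lock is held */
    void * AllocBuf(std::size_t size) {
        void * p = buffer.data() + used;
        used += size;
        return p;
    }
    void ReleaseBuf(std::size_t size) { used -= size; }

private:
    std::mutex bufMutex;
    std::array<char, 1 << 20> buffer{};
    std::size_t used = 0;
};

/* the pattern the commit applies at every temporary-buffer site */
void Worker(MemPool * mem) {
    mem->LockBuf();                   /* lock before touching the pool */
    void * tmp = mem->AllocBuf(1024); /* temporary scratch space */
    (void)tmp;                        /* ... use tmp ... */
    mem->ReleaseBuf(1024);
    mem->UnlockBuf();                 /* unlock after the scratch is gone */
}

int main() {
    MemPool pool;
    std::thread t1(Worker, &pool);
    std::thread t2(Worker, &pool);
    t1.join();
    t2.join();
    return 0;
}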
@@ -25,19 +25,19 @@ option(USE_MKL "Use MKL" OFF)
 option(USE_OPENBLAS "Use OpenBLAS" OFF)
 option(GEN_DLL "Generate Dynamic Link Library" OFF)
-# If set USE_CUDA ON, please modify CUDA_ROOT below.
+# If set USE_CUDA ON, please modify CUDA_TOOLKIT_ROOT below.
 # If set USE_MKL ON, please modify the INTEL_ROOT below.
 # If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
 if (USE_CUDA)
-    if(NOT EXISTS ${CUDA_ROOT})
+    if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
         if(WIN32)
-            set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
+            set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
         else()
-            set(CUDA_ROOT "/usr/local/cuda-9.2")
+            set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
         endif()
     endif()
-    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
-    message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
+    set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
+    message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
 endif()
 if(USE_MKL)
     if(NOT DEFINED INTEL_ROOT)
@@ -128,12 +128,13 @@ if(USE_CUDA)
     if(WIN32)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
+        string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
         set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
-        link_directories("${CUDA_ROOT}/lib/x64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
         if(CUDA_VERSION_MAJOR EQUAL 11)
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
@@ -146,9 +147,9 @@ if(USE_CUDA)
         set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
         set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
-        link_directories("${CUDA_ROOT}/lib64")
-        include_directories("${CUDA_ROOT}/include")
-        set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
+        link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
+        include_directories("${CUDA_TOOLKIT_ROOT}/include")
+        set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
         if(CUDA_VERSION_MAJOR EQUAL 11)
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
@@ -158,7 +159,13 @@ if(USE_CUDA)
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
         set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
+        if(EXISTS "/usr/lib64/libdl.so.2")
             set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
+        elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
+        elseif(EXISTS "/lib64/libdl.so.2")
+            set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
+        endif()
     endif()
 endif()
...
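With the rename above, a CUDA build that does not use the default toolkit location would be configured along the lines of `cmake -DUSE_CUDA=ON -DCUDA_TOOLKIT_ROOT=/usr/local/cuda-9.2 ..` (hypothetical path; the variable name is the one introduced in this hunk). The libdl.so.2 probing added at the end of the hunk is what lets the static CUDA link work across Debian/Ubuntu-style (/lib/x86_64-linux-gnu) and RedHat-style (/usr/lib64, /lib64) library layouts.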
@@ -105,11 +105,15 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
     if (!isEfficient || a->isGrad) {
         XNoder::MakeGrad(a);
+        if (a->mem != NULL)
+            a->mem->LockBuf();
         XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
         _ConvertDataType(node->grad, tmp);
         _SumMe(a->grad, tmp);
         DelTensorBuf(tmp);
+        if (a->mem != NULL)
+            a->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -141,11 +145,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -173,12 +181,16 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         tmp->SetZeroAll();
         _SpreadForGather(tmp, node->grad, index);
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -200,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _CopyValues(node->grad, tmp);
@@ -212,6 +226,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
         _SumMe(input->grad, tmp);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -456,12 +472,16 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
         /* if the tensor is used somewhere else, we need another SUM
            for gradient accumulation */
         else {
+            if (input->mem != NULL)
+                input->mem->LockBuf();
             XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
             _Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
             _Sum(input->grad, inputGradTMP, input->grad);
             DelTensorBuf(inputGradTMP);
+            if (input->mem != NULL)
+                input->mem->UnlockBuf();
         }
     }
@@ -543,12 +563,16 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
            somewhere else, we need another SUM for gradient
            accumulation */
         else {
+            if (node->mem != NULL)
+                node->mem->LockBuf();
             XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
             _Merge(&splits, nodeGradTMP, whereToSplit + 1);
             _Sum(node->grad, nodeGradTMP, node->grad);
             DelTensorBuf(nodeGradTMP);
+            if (node->mem != NULL)
+                node->mem->UnlockBuf();
         }
     }
@@ -584,11 +608,15 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
         CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
         CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
         _Transpose(output->grad, tmp, i, j);
         _Sum(input->grad, tmp, input->grad);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
@@ -622,12 +650,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
     if (!isEfficient || input->isGrad) {
         XNoder::MakeGrad(input);
+        if (input->mem != NULL)
+            input->mem->LockBuf();
         XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
         _ReduceSum(output->grad, tmp, dim);
         _Sum(input->grad, tmp, input->grad);
         DelTensorBuf(tmp);
+        if (input->mem != NULL)
+            input->mem->UnlockBuf();
     }
     node->visitMark = NODE_FINISHED;
...
@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
                                              paddingEnc.dataType, paddingEnc.devID);
     XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     DelTensorBuf(maskEncDecTMPDec);
     DelTensorBuf(maskEncDecTMPEnc);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
     /* padding on the source side */
     int* dimsPadding = new int[paddingEnc.order + 2];
@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
     dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
+    GMems.GetMem(paddingEnc.devID)->LockBuf();
     XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
                                      paddingEnc.devID);
@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
     DelTensorBuf(padding3);
     DelTensorBuf(padding2);
+    GMems.GetMem(paddingEnc.devID)->UnlockBuf();
 }
 /*
...
@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
             _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
             /* v2 = m / (sqrt(v) + delta) */
+            GMems.GetMem(v->devID)->LockBuf();
             XTensor* v2 = NewTensorBuf(v, v->devID);
             _Power(v, v2, 0.5F);
             _ScaleAndShiftMe(v2, 1.0F, d);
@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
             _Sum(para, v2, para, -e);
             DelTensorBuf(v2);
+            GMems.GetMem(v->devID)->UnlockBuf();
         }
         else {
             /* the delta rule */
...
@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
     if (b.order == 0){
         DTYPE scale = 1.0F / b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
         XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
         XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
         ScaleAndShift(a, *tmp1, scale, 0.0F);
         ScaleAndShift(c, *tmp2, alpha, 0.0F);
         Sum(*tmp2, *tmp1, c);
-        DelTensorBuf(tmp1);
         DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
     }
     else {
         int n = GetBroadcastDimIndex(a, b);
...
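A detail worth noting in the Div hunk above (and the Multiply hunk below): c's pool is locked only when it is a different pool from a's. Assuming the pool mutex is non-recursive, as the guard implies, locking the same mutex twice from one thread would deadlock. A condensed, self-contained sketch of the guard (aPool/cPool stand in for a.mem/c.mem):

#include <mutex>

/* aPool/cPool stand in for a.mem/c.mem; a null pool means
   "no memory pool, allocate directly on the device". */
void ScalarOpSketch(std::mutex * aPool, std::mutex * cPool) {
    if (aPool != nullptr)
        aPool->lock();
    /* ... tmp1 = NewTensorBufV2(&a, ...) ... */

    if (cPool != nullptr && cPool != aPool)
        cPool->lock();                /* second lock only if distinct */
    /* ... tmp2 = NewTensorBufV2(&c, ...), compute ... */

    /* release in reverse order of acquisition */
    if (cPool != nullptr && cPool != aPool)
        cPool->unlock();
    if (aPool != nullptr)
        aPool->unlock();
}

The hunk also moves DelTensorBuf(tmp1) after DelTensorBuf(tmp2), so each temporary is released, and its pool unlocked, in reverse order of allocation.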
@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+    if (x.mem != NULL)
+        x.mem->LockBuf();
     XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
     /* call _MatrixMul function */
@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     /* destroy variables */
     delete[] dimSize;
     DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();
     return c;
 }
@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
     float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+    if (x.mem != NULL)
+        x.mem->LockBuf();
     XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
     /* call _MatrixMul function */
@@ -175,6 +181,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
     /* destroy variables */
     delete[] dimSize;
     DelTensorBuf(tmp);
+    if (x.mem != NULL)
+        x.mem->UnlockBuf();
     return c;
 }
...
@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
     if (b.order == 0){
         DTYPE scale = b.Get0D();
+        if (a.mem != NULL)
+            a.mem->LockBuf();
         XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->LockBuf();
+        }
         XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
         ScaleAndShift(a, *tmp1, scale, 0.0F);
         ScaleAndShift(c, *tmp2, alpha, 0.0F);
         Sum(*tmp2, *tmp1, c);
-        DelTensorBuf(tmp1);
         DelTensorBuf(tmp2);
+        if ((c.mem != NULL) && (c.mem != a.mem)) {
+            c.mem->UnlockBuf();
+        }
+        DelTensorBuf(tmp1);
+        if (a.mem != NULL)
+            a.mem->UnlockBuf();
     }
     else {
         int n = GetBroadcastDimIndex(a, b);
...
@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
             source = target;
         }
-        target = t->mem != NULL ?
-                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                 XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        /*target = t->mem != NULL ?
+                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
+                 XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+        if (t->mem != NULL) {
+            t->mem->LockBuf();
+            target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+        }
+        else {
+            target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        }
         s->data = source;
         t->data = target;
@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
         /* free the memory space of the one before the last allocation */
         if(count > 0){
             int size = s->unitNum * s->unitSize;
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, size);
+            }
             else
                 XMemFree(t->devID, source);
         }
@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
         if(isLast){
             CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
             _Multiply(a, t, c, beta);
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                t->mem->UnlockBuf();
+            }
             else
                 XMemFree(t->devID, target);
             target = NULL;
...
@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
             source = target;
         }
-        target = t->mem != NULL ?
-                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
-                 XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        /*target = t->mem != NULL ?
+                 t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
+                 XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
+        if (t->mem != NULL) {
+            t->mem->LockBuf();
+            target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
+        }
+        else {
+            target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
+        }
         s->data = source;
         t->data = target;
@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
         if(isLast){
             CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
             _Sum(a, t, c, beta);
-            if(t->mem != NULL)
+            if(t->mem != NULL) {
                 t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
+                t->mem->UnlockBuf();
+            }
             else
                 XMemFree(t->devID, target);
             target = NULL;
...
@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     DTYPE ** cpGPU = NULL;
     if (mem != NULL) {
+        mem->LockBuf();
         mem->SetPinBuf();
         apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
         bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     delete[] bp;
     delete[] cp;
-    if(mem != NULL)
+    if (mem != NULL) {
         mem->BackToPinBuf();
+        mem->UnlockBuf();
+    }
     else {
         XMemFree(a0->devID, apGPU);
         XMemFree(a0->devID, bpGPU);
...
@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
 #ifdef USE_CUDA
     XMem * mem = tensor->mem;
     MTYPE size = num * sizeof(MTYPE);
-    MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+    //MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
+    MTYPE * offsetsCuda;
+    if (mem != NULL) {
+        mem->LockBuf();
+        offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
+    }
+    else {
+        offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
+    }
     XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
     _CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, size);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(tensor->devID, offsetsCuda);
 #else
...
@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
     int devIDBackup;
     ProtectCudaDev(tensor->devID, devIDBackup);
-    MTYPE * offsetsCuda = mem != NULL ?
-                          (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
-                          (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
-    void * valuesCuda = mem != NULL ?
-                        mem->AllocBuf(mem->devID, valueSize) :
-                        XMemAlloc(tensor->devID, valueSize);
+    /*MTYPE * offsetsCuda = mem != NULL ?
+                          (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
+                          (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+    void * valuesCuda = mem != NULL ?
+                        mem->AllocBuf(mem->devID, valueSize) :
+                        XMemAlloc(tensor->devID, valueSize);*/
+    MTYPE * offsetsCuda;
+    void * valuesCuda;
+    if (mem != NULL) {
+        mem->LockBuf();
+        offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
+        valuesCuda = mem->AllocBuf(mem->devID, valueSize);
+    }
+    else {
+        offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+        valuesCuda = XMemAlloc(tensor->devID, valueSize);
+    }
     if (mem != NULL) {
         XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
     if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, valueSize);
         mem->ReleaseBuf(mem->devID, offsetSize);
+        mem->UnlockBuf();
     }
     else {
         XMemFree(tensor->devID, valuesCuda);
...
@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
     if (devID >= 0) {
 #ifdef USE_CUDA
         /* copy the index from host to device */
-        int * targetBlocksTMP = myMem != NULL ?
-                                (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
-                                (int*)XMemAlloc(devID, blockNum * sizeof(int));
+        /*int * targetBlocksTMP = myMem != NULL ?
+                                (int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
+                                (int*)XMemAlloc(devID, blockNum * sizeof(int));*/
+        int * targetBlocksTMP;
+        if (myMem != NULL) {
+            myMem->LockBuf();
+            targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
+        }
+        else {
+            targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
+        }
         XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
         _CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
-        if(myMem != NULL)
+        if (myMem != NULL) {
             myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+            myMem->UnlockBuf();
+        }
         else
             XMemFree(devID, targetBlocksTMP);
 #else
...
@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
 #ifdef USE_CUDA
     int * indexGPU = index;
     if (!isIndexOnDev) {
+        myMem->LockBuf();
         indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
         XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
     }
     _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
-    if (!isIndexOnDev)
+    if (!isIndexOnDev) {
         myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
+        myMem->UnlockBuf();
+    }
 #else
     ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
...
@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
     ProtectCudaDev(devID, devIDBackup);
     /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = myMem != NULL ?
-                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
-                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
-    int * targetBlocksTMP = myMem != NULL ?
-                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
-                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    /*int * sourceBlocksTMP = myMem != NULL ?
+                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
+                            (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    int * targetBlocksTMP = myMem != NULL ?
+                            (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
+                            (int *)XMemAlloc(devID, blockNum * sizeof(int));*/
+    int * sourceBlocksTMP;
+    int * targetBlocksTMP;
+    if (myMem != NULL) {
+        myMem->LockBuf();
+        sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+        targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    }
     XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
     XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
     if (myMem != NULL) {
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        myMem->UnlockBuf();
     }
     else {
         XMemFree(devID, sourceBlocksTMP);
...
@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
             CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
         }
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
     }
     else {
@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     }
     if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(mem->devID, sIndex);
     }
@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
             CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
         }
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
     }
     else {
@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
     else {
         ShowNTErrors("Unsupported dataType!");
     }
+    if (srcIndex->devID < 0) {
+        if (mem != NULL) {
+            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
+        else
+            XMemFree(mem->devID, sIndex);
+    }
 }
 #endif // USE_CUDA
...
@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
     DTYPE * c = (DTYPE*)collection->data;
     XMem * mem = source->mem;
-    int * si = mem != NULL ?
-               (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
-               (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+    /*int * si = mem != NULL ?
+               (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
+               (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
+    int * si;
+    if (mem != NULL) {
+        mem->LockBuf();
+        si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
+    }
+    else {
+        si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
+    }
     int * ci = si + indexSize;
     XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
     KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
                                             stride, indexSize, si, ci);
-    if(mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(mem->devID, si);
 }
@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
     dim3 threads(cudaBlocks[0], cudaBlocks[1]);
     if (srcIndex->devID < 0) {
-        sIndex = mem != NULL ?
-                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        /*sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
+        if (mem != NULL) {
+            mem->LockBuf();
+            sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
+        }
+        else {
+            sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
+        }
         XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
     }
     else
@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
     }
     if (srcIndex->devID < 0) {
-        if(mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(devID, sIndex);
     }
...
@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
     XMem * mem = input->mem; \
     GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
     int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
+    DTYPE * buf; \
+    if (mem != NULL) { \
+        mem->LockBuf(); \
+        buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
+    } \
+    else { \
+        buf = (DTYPE*)XMemAlloc(devID, bufSize); \
+    } \
     DTYPE * buf1 = buf; \
     DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
     do { \
@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
     \
     } while (strideNum > 1); \
     \
-    if (mem != NULL) \
+    if (mem != NULL) { \
         mem->ReleaseBuf(mem->devID, bufSize); \
+        mem->UnlockBuf(); \
+    } \
     else \
         XMemFree(input->devID, buf); \
 } \
...
@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
     int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+    //DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+    DTYPE * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
+    }
+    else {
+        buf = (DTYPE*)XMemAlloc(devID, bufSize);
+    }
     DTYPE * buf1 = buf;
     DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
     do {
@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     } while (strideNum > 1);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(mem->devID, bufSize);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(devID, buf);
 }
...
@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
     int dims[1] = {source->unitNum};
+    if (source->mem != NULL)
+        source->mem->LockBuf();
     XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
     _CopyValues(source, all);
     _ReduceSum(all, target, 0);
     DelTensorBuf(all);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }
 /*
@@ -72,6 +76,8 @@ sum all the items of the tensor (It should be optimized!)
 void _ReduceSumAll(const XTensor * source, DTYPE * value)
 {
     int * dimSize = new int[MAX_TENSOR_DIM_NUM];
+    if (source->mem != NULL)
+        source->mem->LockBuf();
     XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
     target->SetTMPFlag();
@@ -81,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
     delete[] dimSize;
     DelTensorBuf(target);
+    if (source->mem != NULL)
+        source->mem->UnlockBuf();
 }
 /*
...
@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
         void * dataTMP = t->data;
-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+        if (!isOnSameDevice) {
+            /*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(mem->devID, size);
+            }
+        }
         int blockNumInMerge = s->dimSize[leadingDim];
         int splitSizeInGrid = gridSize / blockNumInMerge;
         int realBlockSize = blockSize * t->unitSize;
-        int * blockIndex = (int*)(mem != NULL ?
-                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
+        /*int * blockIndex = (int*)(mem != NULL ?
+                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
+        }
         _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
         _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
-        if (mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
         else
             XMemFree(s->devID, blockIndex);
         if (!isOnSameDevice) {
             XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-            if (mem != NULL)
+            if (mem != NULL) {
                 mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
             else
                 XMemFree(s->devID, dataTMP);
         }
@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
         void * dataTMP = NULL;
         if (uniform)
             dataTMP = smallsItem0->data;
-        else
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+        else {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(t->devID, size);
+            }
+        }
         tensorTMP->data = dataTMP;
@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
         tensorTMP->data = NULL;
         delete tensorTMP;
-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
             mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(t->devID, dataTMP);
     }
...
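In the _Merge hunk above (and the _Split hunk below), the lock around the blockIndex scratch is taken only when isOnSameDevice holds. In the cross-device case the same pool was already locked when dataTMP was allocated, and it stays locked until dataTMP is released, so locking again for blockIndex would double-lock a presumably non-recursive mutex. A condensed sketch of that conditional (pool stands in for mem; the flags mirror the diff):

#include <mutex>

/* pool stands in for mem; a null pool means direct device allocation */
void MergeSketch(std::mutex * pool, bool isOnSameDevice) {
    if (!isOnSameDevice && pool != nullptr)
        pool->lock();                 /* lock spans dataTMP's lifetime */
    /* ... allocate dataTMP from the pool when devices differ ... */

    if (pool != nullptr && isOnSameDevice)
        pool->lock();                 /* otherwise the lock is already held */
    /* ... allocate blockIndex, build the index, copy blocks ... */
    if (pool != nullptr && isOnSameDevice)
        pool->unlock();

    if (!isOnSameDevice && pool != nullptr) {
        /* ... copy dataTMP to the target device, release it ... */
        pool->unlock();
    }
}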
@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
     GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
                           cudaGridSizes, cudaBlockSizes);
+    myMem->LockBuf();
     myMem->SetPinBuf();
     int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
         (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
     myMem->BackToPinBuf();
+    myMem->UnlockBuf();
     delete[] sourceArrays;
     delete[] targetArrays;
...
@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
         void * dataTMP = t->data;
-        if (!isOnSameDevice)
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+        if (!isOnSameDevice) {
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(s->devID, size);
+            }
+        }
         int realBlockSize = blockSize * t->unitSize;
         int blockSplitSize = blockNum / splitNum;
-        int * blockIndex = (int*)(mem != NULL ?
-                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
-                                  XMemAlloc(s->devID, blockNum * sizeof(int)));
+        /*int * blockIndex = (int*)(mem != NULL ?
+                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * sizeof(int)));*/
+        int * blockIndex;
+        if (mem != NULL) {
+            if (isOnSameDevice) {
+                mem->LockBuf();
+            }
+            blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
+        }
+        else {
+            blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
+        }
         _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
         _CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
-        if (mem != NULL)
+        if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
+            if (isOnSameDevice) {
+                mem->UnlockBuf();
+            }
+        }
         else
             XMemFree(s->devID, blockIndex);
@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
         if (!isOnSameDevice) {
             XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-            if (mem != NULL)
+            if (mem != NULL) {
                 mem->ReleaseBuf(mem->devID, size);
+                mem->UnlockBuf();
+            }
             else
                 XMemFree(s->devID, dataTMP);
         }
@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
             dataTMP = first->data;
         }
         else {
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            //dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
+            if (mem != NULL) {
+                mem->LockBuf();
+                dataTMP = mem->AllocBuf(mem->devID, size);
+            }
+            else {
+                dataTMP = XMemAlloc(big->devID, size);
+            }
         }
         tensorTMP->data = dataTMP;
@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
         tensorTMP->data = NULL;
         delete tensorTMP;
-        if ((!uniform) && (mem != NULL))
+        if ((!uniform) && (mem != NULL)) {
             mem->ReleaseBuf(mem->devID, size);
+            mem->UnlockBuf();
+        }
         else
             XMemFree(big->devID, dataTMP);
     }
...
@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     int m = GetNextPower2(strideNum);
     int n = stride * blockNum;
-    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    //void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
+    void * buf;
+    if (mem != NULL) {
+        mem->LockBuf();
+        buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
+    }
+    else {
+        buf = XMemAlloc(a->devID, n * m * a->unitSize);
+    }
     void * bufIndex = NULL;
     if (indexA != NULL && indexB != NULL) {
         bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
         (bufIndex, indexB->data, m, n, stride, k, blockNum);
-    if (mem != NULL)
+    if (mem != NULL) {
         mem->ReleaseBuf(a->devID, n * m * a->unitSize);
+        mem->UnlockBuf();
+    }
     else
         XMemFree(a->devID, buf);
     if (indexA != NULL && indexB != NULL)
...
@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
+    if (mem != NULL)
+        mem->LockBuf();
     max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
     sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
     DelTensorBuf(max);
     DelTensorBuf(sum);
+    if (mem != NULL)
+        mem->UnlockBuf();
     if (x->devID >= 0) {
         delete blockx;
...
@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     XTensor * max = NULL;
     XTensor * sum = NULL;
+    if (mem != NULL)
+        mem->LockBuf();
     max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
     sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     DelTensorBuf(sum);
     DelTensorBuf(max);
+    if (mem != NULL)
+        mem->UnlockBuf();
     delete[] dimSize;
 }
...
@@ -354,6 +354,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
         dimSize[i - 1] = output->dimSize[i];
     }
+    if (output->mem != NULL)
+        output->mem->LockBuf();
     XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
                                        output->devID, output->mem);
@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
         nonZeroNum = (DTYPE)lossBuf->unitNum;
     }
     else {
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->LockBuf();
+        }
         XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
         _IsNonZero(padding, tmp);
         _ReduceSumAll(tmp, &nonZeroNum);
         DelTensorBuf(tmp);
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->UnlockBuf();
+        }
     }
     loss = loss / nonZeroNum;
@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
     delete[] dimSize;
     DelTensorBuf(lossBuf);
+    if (output->mem != NULL)
+        output->mem->UnlockBuf();
     return loss;
 }
...
@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
 {
     int n = leadingDim < 0 ? output->order - 1 : leadingDim;
+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
     XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
     XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
     DelTensorBuf(interBuf2);
     DelTensorBuf(interBuf1);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }
 }
 /*
@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
         dimSize[i - 1] = output->dimSize[i];
     }
+    if (output->mem != NULL) {
+        output->mem->LockBuf();
+    }
     XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
                                        output->devID, output->mem);
@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
         nonZeroNum = (DTYPE)lossBuf->unitNum;
     }
     else {
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->LockBuf();
+        }
         XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
         _IsNonZero(padding, tmp);
         _ReduceSumAll(tmp, &nonZeroNum);
         DelTensorBuf(tmp);
+        if ((padding->mem != NULL) && (padding->mem != output->mem)) {
+            padding->mem->UnlockBuf();
+        }
     }
     loss = loss / nonZeroNum;
@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
     delete[] dimSize;
     DelTensorBuf(lossBuf);
+    if (output->mem != NULL) {
+        output->mem->UnlockBuf();
+    }
     return loss;
 }
...