Commit ae6e43fd by liyinqiao

Merge with xiaotong branch.

1. Add a mutex to guard memory pool (buffer) operations; a usage sketch of the locking pattern follows the commit header below.
2. Support CMake 3.19 and fix some CMake bugs. Note that the CUDA_ROOT variable in CMake has been renamed to CUDA_TOOLKIT_ROOT; the README has been updated accordingly.
3. Fix some bugs on Ubuntu.
parent 7e102f8a
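
The recurring change throughout this diff is bracketing every temporary buffer-tensor allocation with the new memory-pool lock: LockBuf() before the first NewTensorBufV2/AllocBuf on a pool, UnlockBuf() after the matching DelTensorBuf/ReleaseBuf. A minimal sketch of that pattern, assuming the NiuTensor XMem/XTensor declarations used in the hunks below; the function name and the placeholder op are hypothetical, not part of the commit:

/* Hedged sketch: accumulate a gradient through a temporary buffer tensor
   while holding the pool's buffer lock. */
void AccumulateGradViaBuffer(XTensor * input, XTensor * outputGrad)
{
    if (input->mem != NULL)
        input->mem->LockBuf();                     /* serialize access to the pool's buffer */

    XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
    tmp->SetZeroAll();

    /* ... compute the partial gradient from outputGrad into tmp here ... */

    _SumMe(input->grad, tmp);                      /* accumulate into the persistent gradient */
    DelTensorBuf(tmp);                             /* release the buffer before unlocking */

    if (input->mem != NULL)
        input->mem->UnlockBuf();
}
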
......@@ -25,19 +25,19 @@ option(USE_MKL "Use MKL" OFF)
option(USE_OPENBLAS "Use OpenBLAS" OFF)
option(GEN_DLL "Generate Dynamic Link Library" OFF)
# If set USE_CUDA ON, please modify CUDA_ROOT below.
# If set USE_CUDA ON, please modify CUDA_TOOLKIT_ROOT below.
# If set USE_MKL ON, please modify the INTEL_ROOT below.
# If set USE_OPENBLAS ON, please modify the OPENBLAS_ROOT below.
if (USE_CUDA)
if(NOT EXISTS ${CUDA_ROOT})
if(NOT EXISTS ${CUDA_TOOLKIT_ROOT})
if(WIN32)
set(CUDA_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
set(CUDA_TOOLKIT_ROOT "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.2")
else()
set(CUDA_ROOT "/usr/local/cuda-9.2")
set(CUDA_TOOLKIT_ROOT "/usr/local/cuda-9.2")
endif()
endif()
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_ROOT})
message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
set(CUDA_TOOLKIT_ROOT_DIR ${CUDA_TOOLKIT_ROOT})
message(STATUS "CUDA_TOOLKIT_ROOT: ${CUDA_TOOLKIT_ROOT}")
endif()
if(USE_MKL)
if(NOT DEFINED INTEL_ROOT)
......@@ -128,12 +128,13 @@ if(USE_CUDA)
if(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -m64 -Wno-deprecated-gpu-targets -use_fast_math")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-maxrregcount=0 -Wno-deprecated-gpu-targets -use_fast_math")
string(REPLACE -m32 -m64 CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
set(CMAKE_POLICY_DEFAULT_CMP0028 NEW)
link_directories("${CUDA_ROOT}/lib/x64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib/x64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib/x64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib/x64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublas.lib")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}cublasLt.lib")
......@@ -146,9 +147,9 @@ if(USE_CUDA)
set(CMAKE_CXX_FLAGS "-fPIC -msse4.2 -w -march=native -Wno-enum-compare -Wno-sign-compare -Wno-format -Wno-dev -O3 -DNDEBUG -rdynamic")
set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -maxrregcount=0 --disable-warnings -use_fast_math -DUSE_CUDA -Wno-deprecated-gpu-targets -std=c++11")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${ARCH_FLAGS})
link_directories("${CUDA_ROOT}/lib64")
include_directories("${CUDA_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_ROOT}/lib64/")
link_directories("${CUDA_TOOLKIT_ROOT}/lib64")
include_directories("${CUDA_TOOLKIT_ROOT}/include")
set(CUDA_LIB_DIR "${CUDA_TOOLKIT_ROOT}/lib64/")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublas_static.a")
if(CUDA_VERSION_MAJOR EQUAL 11)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcublasLt_static.a")
......@@ -158,7 +159,13 @@ if(USE_CUDA)
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libnppc_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcudadevrt.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "${CUDA_LIB_DIR}libcurand_static.a")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
if(EXISTS "/usr/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/usr/lib64/libdl.so.2")
elseif(EXISTS "/lib/x86_64-linux-gnu/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib/x86_64-linux-gnu/libdl.so.2")
elseif(EXISTS "/lib64/libdl.so.2")
set(CUDA_LIB_PATH ${CUDA_LIB_PATH} "/lib64/libdl.so.2")
endif()
endif()
endif()
......@@ -293,4 +300,4 @@ else()
target_link_libraries(${NIUTENSOR_EXE} ${ALL_LIB} ${FLAG})
endif()
message(STATUS "${MESS}")
endif()
\ No newline at end of file
endif()
......@@ -105,11 +105,15 @@ void XShapeGrad::GradConvertDataType(XTensor* node, bool isEfficient)
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
if (a->mem != NULL)
a->mem->LockBuf();
XTensor* tmp = NewTensorBufV2(a, a->devID, a->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(a->grad, tmp);
DelTensorBuf(tmp);
if (a->mem != NULL)
a->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -141,11 +145,15 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_SpreadForCopyIndexed(tmp, node->grad, dim, srcIndex, tgtIndex, copyNum);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -173,12 +181,16 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
tmp->SetZeroAll();
_SpreadForGather(tmp, node->grad, index);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -200,6 +212,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_CopyValues(node->grad, tmp);
......@@ -212,6 +226,8 @@ void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficient)
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -456,12 +472,16 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else {
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
DelTensorBuf(inputGradTMP);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
}
......@@ -543,12 +563,16 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else {
if (node->mem != NULL)
node->mem->LockBuf();
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
DelTensorBuf(nodeGradTMP);
if (node->mem != NULL)
node->mem->UnlockBuf();
}
}
......@@ -584,11 +608,15 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
CheckNTErrors(input->order > i && i >= 0, "index of dimension is out of scope!");
CheckNTErrors(input->order > j && j >= 0, "index of dimension is out of scope!");
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
_Transpose(output->grad, tmp, i, j);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......@@ -622,12 +650,16 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
if (!isEfficient || input->isGrad) {
XNoder::MakeGrad(input);
if (input->mem != NULL)
input->mem->LockBuf();
XTensor * tmp = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, tmp, dim);
_Sum(input->grad, tmp, input->grad);
DelTensorBuf(tmp);
if (input->mem != NULL)
input->mem->UnlockBuf();
}
node->visitMark = NODE_FINISHED;
......
......@@ -265,6 +265,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1,
paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
......@@ -275,6 +276,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
/* padding on the source side */
int* dimsPadding = new int[paddingEnc.order + 2];
......@@ -283,6 +285,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
GMems.GetMem(paddingEnc.devID)->LockBuf();
XTensor* padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
......@@ -309,6 +312,7 @@ void Model::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
GMems.GetMem(paddingEnc.devID)->UnlockBuf();
}
/*
......
......@@ -428,6 +428,7 @@ void Trainer::Update(Model* model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
GMems.GetMem(v->devID)->LockBuf();
XTensor* v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
......@@ -437,6 +438,7 @@ void Trainer::Update(Model* model, const float lr)
_Sum(para, v2, para, -e);
DelTensorBuf(v2);
GMems.GetMem(v->devID)->UnlockBuf();
}
else {
/* the delta rule */
......
......@@ -253,15 +253,25 @@ void Div(const XTensor & a, const XTensor & b, XTensor & c, DTYPE alpha, int lea
if (b.order == 0){
DTYPE scale = 1.0F / b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
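
In Div (and Multiply below), the scalar branch needs two buffer tensors that may come from the same pool, so the second LockBuf is guarded by c.mem != a.mem and the buffers are now released in reverse order of allocation. Condensed from the hunk above as a sketch; the wrapper function name is hypothetical:

/* Hedged sketch: lock each distinct pool exactly once and release buffers LIFO. */
void BlendScaled(const XTensor & a, XTensor & c, DTYPE scale, DTYPE alpha)
{
    if (a.mem != NULL)
        a.mem->LockBuf();
    XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);

    if ((c.mem != NULL) && (c.mem != a.mem))       /* never lock the same pool twice */
        c.mem->LockBuf();
    XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);

    ScaleAndShift(a, *tmp1, scale, 0.0F);
    ScaleAndShift(c, *tmp2, alpha, 0.0F);
    Sum(*tmp2, *tmp1, c);

    DelTensorBuf(tmp2);                            /* tmp2 first: buffer space is stack-like */
    if ((c.mem != NULL) && (c.mem != a.mem))
        c.mem->UnlockBuf();
    DelTensorBuf(tmp1);
    if (a.mem != NULL)
        a.mem->UnlockBuf();
}
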
......@@ -61,6 +61,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -101,6 +103,8 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......@@ -137,6 +141,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
if (x.mem != NULL)
x.mem->LockBuf();
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
......@@ -175,6 +181,8 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedX,
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
if (x.mem != NULL)
x.mem->UnlockBuf();
return c;
}
......
......@@ -277,15 +277,25 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
if (b.order == 0){
DTYPE scale = b.Get0D();
if (a.mem != NULL)
a.mem->LockBuf();
XTensor * tmp1 = NewTensorBufV2(&a, a.devID, a.mem);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->LockBuf();
}
XTensor * tmp2 = NewTensorBufV2(&c, c.devID, c.mem);
ScaleAndShift(a, *tmp1, scale, 0.0F);
ScaleAndShift(c, *tmp2, alpha, 0.0F);
Sum(*tmp2, *tmp1, c);
DelTensorBuf(tmp1);
DelTensorBuf(tmp2);
if ((c.mem != NULL) && (c.mem != a.mem)) {
c.mem->UnlockBuf();
}
DelTensorBuf(tmp1);
if (a.mem != NULL)
a.mem->UnlockBuf();
}
else {
int n = GetBroadcastDimIndex(a, b);
......
......@@ -290,9 +290,16 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -302,8 +309,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
/* free the memory space of the one before the last allocation */
if(count > 0){
int size = s->unitNum * s->unitSize;
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, size);
}
else
XMemFree(t->devID, source);
}
......@@ -312,8 +320,10 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Multiply(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
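
The broadcast kernels (here and in _SumBroadcast just below) replace the old "mem != NULL ? mem->AllocBuf(...) : XMemAlloc(...)" ternary with an if/else so the pool can be locked before the allocation and unlocked only after the matching ReleaseBuf. The same idiom could be factored into a pair of helpers; a sketch under that assumption, where AllocScratch/FreeScratch are hypothetical and not part of the commit:

/* Hedged sketch: scratch device memory from a pool's buffer (lock held for
   its lifetime) or from raw device memory when no pool is attached. */
void * AllocScratch(XMem * mem, int devID, MTYPE size)
{
    if (mem != NULL) {
        mem->LockBuf();                            /* released in FreeScratch() */
        return mem->AllocBuf(mem->devID, size);
    }
    return XMemAlloc(devID, size);
}

void FreeScratch(XMem * mem, int devID, void * p, MTYPE size)
{
    if (mem != NULL) {
        mem->ReleaseBuf(mem->devID, size);
        mem->UnlockBuf();
    }
    else
        XMemFree(devID, p);
}
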
......@@ -293,10 +293,16 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
source = target;
}
target = t->mem != NULL ?
/*target = t->mem != NULL ?
t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize):
XMemAlloc(t->devID, t->unitNum * t->unitSize);
XMemAlloc(t->devID, t->unitNum * t->unitSize);*/
if (t->mem != NULL) {
t->mem->LockBuf();
target = t->mem->AllocBuf(t->devID, t->unitNum * t->unitSize);
}
else {
target = XMemAlloc(t->devID, t->unitNum * t->unitSize);
}
s->data = source;
t->data = target;
......@@ -315,8 +321,10 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
if(isLast){
CheckNTErrors(t->unitNum == c->unitNum, "Wrong tensor size!");
_Sum(a, t, c, beta);
if(t->mem != NULL)
if(t->mem != NULL) {
t->mem->ReleaseBuf(t->devID, t->unitNum * t->unitSize);
t->mem->UnlockBuf();
}
else
XMemFree(t->devID, target);
target = NULL;
......
......@@ -330,6 +330,7 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
DTYPE ** cpGPU = NULL;
if (mem != NULL) {
mem->LockBuf();
mem->SetPinBuf();
apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * a->count, 256);
......@@ -356,8 +357,10 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
delete[] bp;
delete[] cp;
if(mem != NULL)
if (mem != NULL) {
mem->BackToPinBuf();
mem->UnlockBuf();
}
else {
XMemFree(a0->devID, apGPU);
XMemFree(a0->devID, bpGPU);
......
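
In the batched cuBLAS path the whole pinned-buffer window is now under the same lock: LockBuf before SetPinBuf, UnlockBuf right after BackToPinBuf. A sketch of that sequence; the wrapper name is hypothetical and only the XMem calls shown in the hunk are assumed:

/* Hedged sketch: pin the buffer top, carve aligned pointer arrays out of it,
   and rewind everything in one step, all while holding the buffer lock. */
void WithPinnedPointerArrays(XMem * mem, int count)
{
    mem->LockBuf();
    mem->SetPinBuf();                              /* remember the current buffer top */

    DTYPE ** apGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);
    DTYPE ** bpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);
    DTYPE ** cpGPU = (DTYPE**)mem->AllocBuf(mem->devID, sizeof(DTYPE*) * count, 256);

    /* ... copy the host pointer arrays in and run the batched multiplications ... */

    mem->BackToPinBuf();                           /* free everything allocated since SetPinBuf */
    mem->UnlockBuf();
}
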
......@@ -696,13 +696,23 @@ void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE nu
#ifdef USE_CUDA
XMem * mem = tensor->mem;
MTYPE size = num * sizeof(MTYPE);
MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
//MTYPE * offsetsCuda = mem != NULL ? (MTYPE*)mem->AllocBuf(mem->devID, size) : (MTYPE*)XMemAlloc(tensor->devID, size);
MTYPE * offsetsCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, size);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, size);
}
XMemCopy(offsetsCuda, tensor->devID, offsets, -1, num * sizeof(MTYPE));
_CudaSetDataWithOffset(tensor, offsetsCuda, value, num);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(tensor->devID, offsetsCuda);
#else
......
......@@ -636,12 +636,23 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
MTYPE * offsetsCuda = mem != NULL ?
/*MTYPE * offsetsCuda = mem != NULL ?
(MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
(MTYPE*)XMemAlloc(tensor->devID, offsetSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);
void * valuesCuda = mem != NULL ?
mem->AllocBuf(mem->devID, valueSize) :
XMemAlloc(tensor->devID, valueSize);*/
MTYPE * offsetsCuda;
void * valuesCuda;
if (mem != NULL) {
mem->LockBuf();
offsetsCuda = (MTYPE*)mem->AllocBuf(mem->devID, offsetSize);
valuesCuda = mem->AllocBuf(mem->devID, valueSize);
}
else {
offsetsCuda = (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
valuesCuda = XMemAlloc(tensor->devID, valueSize);
}
if (mem != NULL) {
XMemCopy(offsetsCuda, mem->devID, offsets, -1, offsetSize);
......@@ -657,6 +668,7 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, valueSize);
mem->ReleaseBuf(mem->devID, offsetSize);
mem->UnlockBuf();
}
else {
XMemFree(tensor->devID, valuesCuda);
......
......@@ -45,15 +45,25 @@ void _CopyBlocks(void * source, int unitSize, int blockSize, int blockNum, void
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = myMem != NULL ?
/*int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
(int*)XMemAlloc(devID, blockNum * sizeof(int));*/
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
targetBlocksTMP = (int*)myMem->AllocBuf(devID, blockNum * sizeof(int));
}
else {
targetBlocksTMP = (int*)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, unitSize, blockSize, blockNum, target, targetBlocksTMP, devID);
if(myMem != NULL)
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else
XMemFree(devID, targetBlocksTMP);
#else
......
......@@ -47,14 +47,17 @@ void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum,
#ifdef USE_CUDA
int * indexGPU = index;
if (!isIndexOnDev) {
myMem->LockBuf();
indexGPU = (int*)myMem->AllocBuf(myMem->devID, blockNum * gridNum * sizeof(int));
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
if (!isIndexOnDev) {
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
myMem->UnlockBuf();
}
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,12 +80,23 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ?
/*int * sourceBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) :
(int *)XMemAlloc(devID, blockNum * sizeof(int));
(int *)XMemAlloc(devID, blockNum * sizeof(int));*/
int * sourceBlocksTMP;
int * targetBlocksTMP;
if (myMem != NULL) {
myMem->LockBuf();
sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
}
else {
sourceBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
targetBlocksTMP = (int *)XMemAlloc(devID, blockNum * sizeof(int));
}
XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
......@@ -107,6 +118,7 @@ void _CudaCopyBlocksSelected(void * source, int unitSize, int blockSize, int * s
if (myMem != NULL) {
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
myMem->UnlockBuf();
}
else {
XMemFree(devID, sourceBlocksTMP);
......
......@@ -131,9 +131,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -169,8 +176,10 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
......@@ -209,9 +218,16 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
CheckNTErrors(srcIndexValue < s->unitNum, "Wrong index!");
}
sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
}
else {
......@@ -238,6 +254,15 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
else {
ShowNTErrors("Unsupported dataType!");
}
if (srcIndex->devID < 0) {
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, sIndex);
}
}
#endif // USE_CUDA
......
......@@ -177,9 +177,17 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
/*int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);*/
int * si;
if (mem != NULL) {
mem->LockBuf();
si = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2);
}
else {
si = (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
}
int * ci = si + indexSize;
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
......@@ -188,8 +196,10 @@ void _CudaSpread(XTensor * source, XTensor * collection, int dim,
KernelSpreadFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl,
stride, indexSize, si, ci);
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
mem->UnlockBuf();
}
else
XMemFree(mem->devID, si);
}
......@@ -393,9 +403,16 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
/*sIndex = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(devID, sizeof(int) * indexSize);
(int*)XMemAlloc(devID, sizeof(int) * indexSize);*/
if (mem != NULL) {
mem->LockBuf();
sIndex = (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize);
}
else {
sIndex = (int*)XMemAlloc(devID, sizeof(int) * indexSize);
}
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
......@@ -422,8 +439,10 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcI
}
if (srcIndex->devID < 0) {
if(mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->UnlockBuf();
}
else
XMemFree(devID, sIndex);
}
......
......@@ -512,8 +512,8 @@ void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
get the max-valued items along a dimension of the tensor (cuda version).
/*
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum_i = max_{0<=j<strideNum} input_{i,j}
>> input - the input tensor
......@@ -574,7 +574,14 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf; \
if (mem != NULL) { \
mem->LockBuf(); \
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize); \
} \
else { \
buf = (DTYPE*)XMemAlloc(devID, bufSize); \
} \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
......@@ -706,8 +713,10 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
\
} while (strideNum > 1); \
\
if (mem != NULL) \
if (mem != NULL) { \
mem->ReleaseBuf(mem->devID, bufSize); \
mem->UnlockBuf(); \
} \
else \
XMemFree(input->devID, buf); \
} \
......
......@@ -757,7 +757,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
//DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf;
if (mem != NULL) {
mem->LockBuf();
buf = (DTYPE*)mem->AllocBuf(mem->devID, bufSize);
}
else {
buf = (DTYPE*)XMemAlloc(devID, bufSize);
}
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
......@@ -907,8 +915,10 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
} while (strideNum > 1);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, bufSize);
mem->UnlockBuf();
}
else
XMemFree(devID, buf);
}
......
......@@ -56,12 +56,16 @@ void _ReduceSumAll(const XTensor * source, XTensor * target)
int dims[1] = {source->unitNum};
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * all = NewTensorBufV2(1, dims, source->dataType, source->denseRatio, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, target, 0);
DelTensorBuf(all);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......@@ -72,6 +76,8 @@ sum all the items of the tensor (It should be optimized!)
void _ReduceSumAll(const XTensor * source, DTYPE * value)
{
int * dimSize = new int[MAX_TENSOR_DIM_NUM];
if (source->mem != NULL)
source->mem->LockBuf();
XTensor * target = NewTensorBufV2(0, dimSize, source->dataType, source->denseRatio, source->devID, source->mem);
target->SetTMPFlag();
......@@ -81,6 +87,8 @@ void _ReduceSumAll(const XTensor * source, DTYPE * value)
delete[] dimSize;
DelTensorBuf(target);
if (source->mem != NULL)
source->mem->UnlockBuf();
}
/*
......
......@@ -118,30 +118,54 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
if (!isOnSameDevice) {
/*dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);*/
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(mem->devID, size);
}
}
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * gridNum * sizeof(int));
}
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum * gridNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -358,8 +382,16 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
void * dataTMP = NULL;
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
else {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(t->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(t->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -378,8 +410,10 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(t->devID, dataTMP);
}
......
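
_Merge above (and _Split further down) may carve two scratch regions out of the same pool: dataTMP only when source and target sit on different devices, and blockIndex always. The lock is taken once by whichever allocation comes first, which is why the new code tests isOnSameDevice before calling LockBuf/UnlockBuf around blockIndex. A sketch of that decision, with a hypothetical wrapper and the same XMem calls as the hunk:

/* Hedged sketch: one LockBuf per pool, no matter how many buffer regions
   are carved out of it. */
void MergeScratchSketch(const XTensor * s, XTensor * t, XMem * mem,
                        bool isOnSameDevice, MTYPE size, MTYPE indexSize)
{
    void * dataTMP = t->data;
    int * blockIndex = NULL;

    if (!isOnSameDevice) {                         /* need a staging copy on the source device */
        if (mem != NULL) {
            mem->LockBuf();                        /* first region from this pool takes the lock */
            dataTMP = mem->AllocBuf(mem->devID, size);
        }
        else
            dataTMP = XMemAlloc(s->devID, size);
    }

    if (mem != NULL) {
        if (isOnSameDevice)
            mem->LockBuf();                        /* dataTMP did not lock, so lock here instead */
        blockIndex = (int*)mem->AllocBuf(mem->devID, indexSize);
    }
    else
        blockIndex = (int*)XMemAlloc(s->devID, indexSize);

    /* ... build the block index and copy the blocks here ... */

    if (mem != NULL) {
        mem->ReleaseBuf(mem->devID, indexSize);
        if (isOnSameDevice)
            mem->UnlockBuf();                      /* nothing else of ours left in the buffer */
    }
    else
        XMemFree(s->devID, blockIndex);

    if (!isOnSameDevice) {
        XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
        if (mem != NULL) {
            mem->ReleaseBuf(mem->devID, size);
            mem->UnlockBuf();
        }
        else
            XMemFree(s->devID, dataTMP);
    }
}
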
......@@ -117,7 +117,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
GDevs.GetCudaThread2D(myMem->devID, realMaxBlockSize, newBlockListSize, MAX_INT,
cudaGridSizes, cudaBlockSizes);
myMem->LockBuf();
myMem->SetPinBuf();
int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
......@@ -133,6 +133,7 @@ void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int block
(sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
myMem->BackToPinBuf();
myMem->UnlockBuf();
delete[] sourceArrays;
delete[] targetArrays;
......
......@@ -110,22 +110,44 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void * dataTMP = t->data;
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (!isOnSameDevice) {
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(s->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(s->devID, size);
}
}
int realBlockSize = blockSize * t->unitSize;
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
/*int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
XMemAlloc(s->devID, blockNum * sizeof(int)));*/
int * blockIndex;
if (mem != NULL) {
if (isOnSameDevice) {
mem->LockBuf();
}
blockIndex = (int*)mem->AllocBuf(mem->devID, blockNum * sizeof(int));
}
else {
blockIndex = (int*)XMemAlloc(s->devID, blockNum * sizeof(int));
}
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, s->unitSize, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
if (isOnSameDevice) {
mem->UnlockBuf();
}
}
else
XMemFree(s->devID, blockIndex);
......@@ -133,8 +155,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
if (!isOnSameDevice) {
XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(s->devID, dataTMP);
}
......@@ -333,7 +357,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
//dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
if (mem != NULL) {
mem->LockBuf();
dataTMP = mem->AllocBuf(mem->devID, size);
}
else {
dataTMP = XMemAlloc(big->devID, size);
}
}
tensorTMP->data = dataTMP;
......@@ -354,8 +385,10 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
tensorTMP->data = NULL;
delete tensorTMP;
if ((!uniform) && (mem != NULL))
if ((!uniform) && (mem != NULL)) {
mem->ReleaseBuf(mem->devID, size);
mem->UnlockBuf();
}
else
XMemFree(big->devID, dataTMP);
}
......
......@@ -234,7 +234,15 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
//void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
void * buf;
if (mem != NULL) {
mem->LockBuf();
buf = mem->AllocBuf(a->devID, n * m * a->unitSize);
}
else {
buf = XMemAlloc(a->devID, n * m * a->unitSize);
}
void * bufIndex = NULL;
if (indexA != NULL && indexB != NULL) {
bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
......@@ -289,8 +297,10 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
(bufIndex, indexB->data, m, n, stride, k, blockNum);
if (mem != NULL)
if (mem != NULL) {
mem->ReleaseBuf(a->devID, n * m * a->unitSize);
mem->UnlockBuf();
}
else
XMemFree(a->devID, buf);
if (indexA != NULL && indexB != NULL)
......
......@@ -79,6 +79,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -153,6 +155,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(max);
DelTensorBuf(sum);
if (mem != NULL)
mem->UnlockBuf();
if (x->devID >= 0) {
delete blockx;
......
......@@ -54,6 +54,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * max = NULL;
XTensor * sum = NULL;
if (mem != NULL)
mem->LockBuf();
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
......@@ -113,6 +115,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
DelTensorBuf(sum);
DelTensorBuf(max);
if (mem != NULL)
mem->UnlockBuf();
delete[] dimSize;
}
......
......@@ -354,8 +354,10 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL)
output->mem->LockBuf();
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
output->devID, output->mem);
_CrossEntropy(output, gold, lossBuf, weight, padding, leadingDim);
......@@ -367,10 +369,16 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -384,6 +392,8 @@ DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL)
output->mem->UnlockBuf();
return loss;
}
......
......@@ -57,6 +57,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
{
int n = leadingDim < 0 ? output->order - 1 : leadingDim;
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * interBuf1 = NewTensorBufV2(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBufV2(output, output->devID, output->mem);
......@@ -73,6 +76,9 @@ void _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
}
/*
......@@ -118,6 +124,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
dimSize[i - 1] = output->dimSize[i];
}
if (output->mem != NULL) {
output->mem->LockBuf();
}
XTensor * lossBuf = NewTensorBufV2(output->order - 1, dimSize, output->dataType, output->denseRatio,
output->devID, output->mem);
......@@ -131,10 +140,16 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
nonZeroNum = (DTYPE)lossBuf->unitNum;
}
else {
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->LockBuf();
}
XTensor * tmp = NewTensorBufV2(padding, padding->devID, padding->mem);
_IsNonZero(padding, tmp);
_ReduceSumAll(tmp, &nonZeroNum);
DelTensorBuf(tmp);
if ((padding->mem != NULL) && (padding->mem != output->mem)) {
padding->mem->UnlockBuf();
}
}
loss = loss / nonZeroNum;
......@@ -148,6 +163,9 @@ DTYPE _CudaCrossEntropyFast(const XTensor * output, const XTensor * gold,
delete[] dimSize;
DelTensorBuf(lossBuf);
if (output->mem != NULL) {
output->mem->UnlockBuf();
}
return loss;
}
......
......@@ -86,4 +86,4 @@ public:
}
#endif
#endif
\ No newline at end of file