Commit a095188b by liyinqiao

Bug fixed. (The previous version could not run on GPUs with the Pascal architecture or older.)

Fix the bugs in ReduceMax and related functions.
parent f1792ca4
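In short, the fix moves the temporary reduction buffer (buf, buf1, buf2) and its release out of the common setup and into the else branch that actually runs the multi-pass reduction, and sizes it with input->unitSize rather than sizeof(DTYPE). A minimal CUDA sketch of that structure follows; reduceMaxBufferSketch, useOpKernels and gridX are illustrative names, not code from this repository.

#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: mirrors the buffer handling after this commit.
void reduceMaxBufferSketch(int unitSize, int gridX, int stride, int blockNum,
                           bool useOpKernels)
{
    if (useOpKernels) {
        // Fast path (KernelReduceMaxOp / KernelReduceMaxOpLessBlocks in the
        // real code): no intermediate buffer is allocated at all.
    }
    else {
        // General path: two staging areas of gridX * stride * blockNum
        // elements each, hence the trailing "* 2"; using the element size
        // (unitSize) keeps the allocation correct for half-precision data.
        size_t bufSize = (size_t)unitSize * gridX * stride * blockNum * 2;
        void * buf = NULL;
        cudaMalloc(&buf, bufSize);  // stands in for mem->AllocBuf / XMemAlloc
        // ... iterative KernelReduceMax / KernelReduceMaxFast launches would
        // ping-pong between the two halves of buf here ...
        cudaFree(buf);              // stands in for mem->ReleaseBuf / XMemFree
    }
}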
@@ -554,14 +554,6 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
blockSize = stride * strideNum; \
\
int devID = input->devID; \
XMem * mem = input->mem; \
\
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
\
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize); \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
\
int devIDBackup; \
ProtectCudaDev(input->devID, devIDBackup); \
@@ -580,6 +572,12 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
} \
} \
else { \
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
if (input->dataType == DEFAULT_DTYPE) { \
DTYPE * iData = NULL; \
@@ -666,7 +664,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 128) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -674,7 +672,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 256) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -682,7 +680,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 512) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -690,7 +688,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -698,7 +696,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
} \
\
@@ -708,14 +706,13 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
iter++; \
\
} while (strideNum > 1); \
} \
\
if (mem != NULL) \
mem->ReleaseBuf(mem->devID, bufSize); \
else \
XMemFree(input->devID, buf); \
} \
BacktoCudaDev(input->devID, devIDBackup); \
\
if (mem != NULL) \
mem->ReleaseBuf(mem->devID, bufSize); \
else \
XMemFree(input->devID, buf); \
}
_CUDAREDUCE(_CudaReduceMax, KernelReduceMaxOp, KernelReduceMaxOpLessBlocks, KernelReduceMax, KernelReduceMaxFast)
......
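As a side note on the launch ladder visible in the hunks above: each branch requests at least MAX(strideNum / 2 + 1, N) threads and launches KernelReduceMaxFast<N> for N in {64, 128, 256, 512}. A compilable sketch of that dispatch shape, with a placeholder kernel and illustrative names (kernelFastSketch, launchDispatchSketch):

#include <cuda_runtime.h>

// Placeholder kernel: a real implementation would perform a shared-memory or
// warp-shuffle tree reduction over BLOCK threads per output element.
template<unsigned BLOCK>
__global__ void kernelFastSketch(const float * in, float * out, int strideNum)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        out[0] = in[0];
}

// Same strideNum-based selection as in the hunks above, reduced to its shape.
void launchDispatchSketch(const float * in, float * out, int strideNum)
{
    if (strideNum < 128)
        kernelFastSketch<64>  <<<1, 64>>>  (in, out, strideNum);
    else if (strideNum < 256)
        kernelFastSketch<128> <<<1, 128>>> (in, out, strideNum);
    else if (strideNum < 512)
        kernelFastSketch<256> <<<1, 256>>> (in, out, strideNum);
    else
        kernelFastSketch<512> <<<1, 512>>> (in, out, strideNum);
}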