Commit a095188b by liyinqiao

Bug fixed. (The previous version could not run on GPUs with the Pascal architecture or older.)

Fix the bugs in ReduceMax and related functions.
parent f1792ca4
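In short, the fix moves the temporary reduction buffer (buf, buf1, buf2) and its release out of the common setup and into the else branch that actually runs the multi-pass reduction, and sizes it with input->unitSize rather than sizeof(DTYPE). A minimal CUDA sketch of that structure follows; reduceMaxBufferSketch, useOpKernels and gridX are illustrative names, not code from this repository.

#include <cuda_runtime.h>
#include <cstddef>

// Sketch only: mirrors the buffer handling after this commit.
void reduceMaxBufferSketch(int unitSize, int gridX, int stride, int blockNum,
                           bool useOpKernels)
{
    if (useOpKernels) {
        // Fast path (KernelReduceMaxOp / KernelReduceMaxOpLessBlocks in the
        // real code): no intermediate buffer is allocated at all.
    }
    else {
        // General path: two staging areas of gridX * stride * blockNum
        // elements each, hence the trailing "* 2"; using the element size
        // (unitSize) keeps the allocation correct for half-precision data.
        size_t bufSize = (size_t)unitSize * gridX * stride * blockNum * 2;
        void * buf = NULL;
        cudaMalloc(&buf, bufSize);  // stands in for mem->AllocBuf / XMemAlloc
        // ... iterative KernelReduceMax / KernelReduceMaxFast launches would
        // ping-pong between the two halves of buf here ...
        cudaFree(buf);              // stands in for mem->ReleaseBuf / XMemFree
    }
}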
@@ -554,14 +554,6 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
blockSize = stride * strideNum; \
\
int devID = input->devID; \
XMem * mem = input->mem; \
\
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
\
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize); \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
\
int devIDBackup; \
ProtectCudaDev(input->devID, devIDBackup); \
@@ -580,6 +572,12 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
} \
} \
else { \
XMem * mem = input->mem; \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize); \
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
do { \
if (input->dataType == DEFAULT_DTYPE) { \
DTYPE * iData = NULL; \
@@ -666,7 +664,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 128) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -674,7 +672,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 256) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -682,7 +680,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else if (strideNum < 512) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -690,7 +688,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
else { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
@@ -698,7 +696,7 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
if (cudaGridSize[0] == 1) \
oData = (__half*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
} \
} \
\
@@ -708,14 +706,13 @@ void _funcName(const XTensor * input, XTensor * output, int dim)
iter++; \
\
} while (strideNum > 1); \
} \
\
if (mem != NULL) \
mem->ReleaseBuf(mem->devID, bufSize); \
else \
XMemFree(input->devID, buf); \
} \
BacktoCudaDev(input->devID, devIDBackup); \
\
if (mem != NULL) \
mem->ReleaseBuf(mem->devID, bufSize); \
else \
XMemFree(input->devID, buf); \
}
_CUDAREDUCE(_CudaReduceMax, KernelReduceMaxOp, KernelReduceMaxOpLessBlocks, KernelReduceMax, KernelReduceMaxFast)
......
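As a side note on the launch ladder visible in the hunks above: each branch requests at least MAX(strideNum / 2 + 1, N) threads and launches KernelReduceMaxFast<N> for N in {64, 128, 256, 512}. A compilable sketch of that dispatch shape, with a placeholder kernel and illustrative names (kernelFastSketch, launchDispatchSketch):

#include <cuda_runtime.h>

// Placeholder kernel: a real implementation would perform a shared-memory or
// warp-shuffle tree reduction over BLOCK threads per output element.
template<unsigned BLOCK>
__global__ void kernelFastSketch(const float * in, float * out, int strideNum)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        out[0] = in[0];
}

// Same strideNum-based selection as in the hunks above, reduced to its shape.
void launchDispatchSketch(const float * in, float * out, int strideNum)
{
    if (strideNum < 128)
        kernelFastSketch<64>  <<<1, 64>>>  (in, out, strideNum);
    else if (strideNum < 256)
        kernelFastSketch<128> <<<1, 128>>> (in, out, strideNum);
    else if (strideNum < 512)
        kernelFastSketch<256> <<<1, 256>>> (in, out, strideNum);
    else
        kernelFastSketch<512> <<<1, 512>>> (in, out, strideNum);
}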