Relocate the CUDA thread allocation code to just before the kernel launch

99225c29 · xiaotong · 58abfba4 · 99225c29
Commit 99225c29 authored Nov 28, 2018 by xiaotong
--- a/source/tensor/core/movement/Spread.cu
+++ b/source/tensor/core/movement/Spread.cu
@@ -252,11 +252,6 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
        }
    }
    else{
-        GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
-
-        dim3 blocks(cudaGrids[0], cudaGrids[1]);
-        dim3 threads(cudaBlocks[0], cudaBlocks[1]);
-
        XMem * mem = source->mem;
        int * si = mem != NULL ? 
                  (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) : 
@@ -271,6 +266,11 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
        DTYPE * s = (DTYPE*)source->data;
        DTYPE * c = (DTYPE*)collection->data;

+        GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
+
+        dim3 blocks(cudaGrids[0], cudaGrids[1]);
+        dim3 threads(cudaBlocks[0], cudaBlocks[1]);
+
        KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);

        if (mem != NULL) {