Commit 99225c29 by xiaotong

Move the CUDA thread allocation code to just before the kernel launch

parent 58abfba4
......@@ -252,11 +252,6 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
}
}
else{
GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
XMem * mem = source->mem;
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
......@@ -271,6 +266,11 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
GDevs.GetCudaThread2D(source->devID, blockNum * indexSize, stride, MAX_INT, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);
if (mem != NULL) {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论