Commit 4cabf059 by xiaotong

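better memory allocation: allocate and release the source and collection index buffers separately in _CudaSpreadForGather instead of sharing one buffer of 2 * indexSize ints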

parent cf149d57
......@@ -257,24 +257,32 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]);
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
XMem * mem = source->mem;
int * si = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize * 2) :
(int*)XMemAlloc(mem->devID, sizeof(int) * indexSize * 2);
int * ci = si + indexSize;
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
int * ci = mem != NULL ?
(int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
(int*)XMemAlloc(collection->devID, sizeof(int) * indexSize);
XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
XMemCopy(ci, collection->devID, collIndex, -1, sizeof(int) * indexSize);
//XMemCopy(si, source->devID, srcIndex, -1, sizeof(int));
//XMemCopy(ci, collection->devID, collIndex, -1, sizeof(int));
XMemCopy(si, mem->devID, srcIndex, -1, sizeof(int) * indexSize);
XMemCopy(ci, mem->devID, collIndex, -1, sizeof(int) * indexSize);
DTYPE * s = (DTYPE*)source->data;
DTYPE * c = (DTYPE*)collection->data;
KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);
if(mem != NULL)
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize * 2);
else
XMemFree(mem->devID, si);
if (mem != NULL) {
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
}
else {
XMemFree(source->devID, si);
XMemFree(collection->devID, ci);
}
}
BacktoCudaDev(source->devID, devIDBackup);
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论