Commit 15f75d3a by xiaotong

Add devID as an explicit argument to the block-copy functions (_CopyBlocks, _CopyBlocksOnSite, _CudaCopyBlocks) so that an XMem memory pool is no longer a required input.

parent 3f23f074
...@@ -35,24 +35,33 @@ copy a number of blocks to target positions ...@@ -35,24 +35,33 @@ copy a number of blocks to target positions
>> target - target data array >> target - target data array
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
>> devID - device id
*/ */
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (myMem != NULL)
devID = myMem->devID;
if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy the index from host to device */ /* copy the index from host to device */
int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)); int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, devID);
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); if(myMem != NULL)
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
else
XMemFree(devID, targetBlocksTMP);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
} }
else { else {
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, devID);
} }
} }
...@@ -65,11 +74,12 @@ copy a number of blocks from source positions to target positions ...@@ -65,11 +74,12 @@ copy a number of blocks from source positions to target positions
>> target - target data array >> target - target data array
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
>> devID - device id
*/ */
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
if (myMem != NULL) if (myMem != NULL)
CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!"); devID = myMem->devID;
if (devID >= 0) { if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */ /* copy a number of blocks to target positions */
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
/* copy a number of blocks from source positions to target positions */ /* copy a number of blocks from source positions to target positions */
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
......
...@@ -34,20 +34,18 @@ all the data has been on the device (CPU/GPU) already. ...@@ -34,20 +34,18 @@ all the data has been on the device (CPU/GPU) already.
>> blockNum - number of blocks >> blockNum - number of blocks
>> target - target data array >> target - target data array
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> devID - device id
*/ */
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem); _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, devID);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
} }
else { else {
int devID = myMem != NULL ? myMem->devID : -1;
/* /*
The following code should be fine with GPUs, but too many The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use kernel calls would slow down the system. We prefer to use
...@@ -55,8 +53,8 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target ...@@ -55,8 +53,8 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
*/ */
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) { for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize); (char*)source + b, devID, blockSize);
} }
} }
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -78,13 +78,12 @@ copy a number of blocks to target positions (cuda version) ...@@ -78,13 +78,12 @@ copy a number of blocks to target positions (cuda version)
>> blockNum - number of blocks >> blockNum - number of blocks
>> target - target data array >> target - target data array
>> targetBlocks - target positions of the copy (on the device) >> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool >> devID - device id
*/ */
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{ {
CheckNTErrors((myMem != NULL), "No memory pool!"); CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!"); CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
...@@ -92,15 +91,15 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, ...@@ -92,15 +91,15 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
if (bSize % 4 == 0) { if (bSize % 4 == 0) {
GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<4> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks); ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
} }
else { else {
GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks); ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
} }
} }
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -33,10 +33,10 @@ __global__ ...@@ -33,10 +33,10 @@ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks); void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKS_CUH__ #endif // __COPYBLOCKS_CUH__
\ No newline at end of file
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */ /* copy a number of blocks to target positions (on site) */
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -126,7 +126,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -126,7 +126,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem); _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem); _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL) if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int)); mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
......
...@@ -138,7 +138,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -138,7 +138,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID); _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem); _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL) if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int)); mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论