Commit 15f75d3a by xiaotong

add devID as an argument so that the functions no longer require an XMem memory pool as input

parent 3f23f074
......@@ -35,24 +35,33 @@ copy a number of blocks to target positions
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
>> devID - device id
*/
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{
if (myMem != NULL && myMem->devID >= 0) {
if (myMem != NULL)
devID = myMem->devID;
if (devID >= 0) {
#ifdef USE_CUDA
/* copy the index from host to device */
int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ?
(int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)):
(int*)XMemAlloc(devID, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, devID);
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
if(myMem != NULL)
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
else
XMemFree(devID, targetBlocksTMP);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
}
else {
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, devID);
}
}
......@@ -65,11 +74,12 @@ copy a number of blocks source source positions to target positions
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
>> devID - device id
*/
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{
if (myMem != NULL)
CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
devID = myMem->devID;
if (devID >= 0) {
#ifdef USE_CUDA
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
/* copy a number of blocks from source positions to target positions */
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
......
......@@ -34,20 +34,18 @@ all the data has been on the device (CPU/GPU) already.
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
>> devID - device id
*/
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{
if (myMem != NULL && myMem->devID >= 0) {
if (devID >= 0) {
#ifdef USE_CUDA
_CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
_CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, devID);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
}
else {
int devID = myMem != NULL ? myMem->devID : -1;
/*
The following code should be fine with GPUs, but too many
kernel calls would slow down the system. We prefer to use
......@@ -55,8 +53,8 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
*/
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize);
(char*)source + b, devID, blockSize);
}
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -78,13 +78,12 @@ copy a number of blocks to target positions (cuda version)
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool
>> devID - device id
*/
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
int cudaGrids[3];
int cudaBlocks[3];
......@@ -92,15 +91,15 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
if (bSize % 4 == 0) {
GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<4> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
}
else {
GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
}
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -33,10 +33,10 @@ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __COPYBLOCKS_CUH__
\ No newline at end of file
#endif // __COPYBLOCKS_CUH__
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -126,7 +126,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
......
......@@ -138,7 +138,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论