Commit e84e725e by xiaotong

speed-up data copy

parent f7ed3448
...@@ -51,10 +51,18 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target ...@@ -51,10 +51,18 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
kernel calls would slow down the system. We prefer to use kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion). one kernel to do block copy in batch (kernel fusion).
*/ */
if(blockSize == sizeof(int)){
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
*(int*)((char*)target + targetBlocks[i] * blockSize) =
*(int*)((char*)source + b);
}
}
else{
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) { for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize); (char*)source + b, devID, blockSize);
} }
} }
}
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -36,39 +36,48 @@ NOTE that this version makes more use of the 2d threads in cuda ...@@ -36,39 +36,48 @@ NOTE that this version makes more use of the 2d threads in cuda
>> target - target data array >> target - target data array
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
*/ */
template<int miniBlockSize> template<class T>
__global__ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks) void KernelCopyBlocks(T * source, int blockSize, int blockNum, T * target, int * targetBlocks)
{ {
/* entry index in the block */ /* entry index in the block */
int i = (blockDim.x * blockIdx.x + threadIdx.x) * miniBlockSize; int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block index */ /* block index */
int j = blockDim.y * blockIdx.y + threadIdx.y; int j = blockDim.y * blockIdx.y + threadIdx.y;
if (j >= blockNum) if (i >= blockSize || j >= blockNum)
return; return;
/* target position */ T * s = source + blockSize * j;
int k = targetBlocks[j]; T * t = target + blockSize * targetBlocks[j];
DTYPE * s = source + blockSize * j;
DTYPE * t = target + blockSize * k;
if (i < blockSize) {
if (miniBlockSize == 4) {
t[i] = s[i];
t[i + 1] = s[i + 1];
t[i + 2] = s[i + 2];
t[i + 3] = s[i + 3];
}
else if (miniBlockSize <= 1) {
t[i] = s[i]; t[i] = s[i];
} }
else {
printf("something wrong!"); /*
} copy a number of blocks to target positions
} NOTE that this version makes more use of the 2d threads in cuda
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<class T>
__global__
void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, T * target, int * targetBlocks)
{
/* entry index in the block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= totalSize)
return;
int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i];
} }
/* /*
...@@ -83,21 +92,30 @@ copy a number of blocks to target positions (cuda version) ...@@ -83,21 +92,30 @@ copy a number of blocks to target positions (cuda version)
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID) void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{ {
CheckNTErrors(devID >= 0, "Wrong device to run!"); CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int bSize = blockSize / sizeof(DTYPE);
if (bSize % 4 == 0) { if(blockSize % sizeof(double) == 0){
GDevs.GetCudaThread2D(devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks); int bSize = blockSize / sizeof(double);
KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> //GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks); //KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((double*)source, bSize, blockNum, (double*)target, targetBlocks);
} }
else { else
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
//GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
//KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>> KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks); ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
}
else{
ShowNTErrors("Unsupported block size!");
} }
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,10 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,10 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy a number of blocks to target positions */
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论