Commit e84e725e by xiaotong

speed-up data copy

parent f7ed3448
......@@ -51,9 +51,17 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
kernel calls would slow down the system. We prefer to use
one kernel to do block copy in batch (kernel fusion).
*/
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize);
if(blockSize == sizeof(int)){
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
*(int*)((char*)target + targetBlocks[i] * blockSize) =
*(int*)((char*)source + b);
}
}
else{
for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + b, devID, blockSize);
}
}
}
}
......
......@@ -36,39 +36,48 @@ NOTE that this version makes more use of the 2d threads in cuda
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<int miniBlockSize>
template<class T>
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks)
void KernelCopyBlocks(T * source, int blockSize, int blockNum, T * target, int * targetBlocks)
{
/* entry index in the block */
int i = (blockDim.x * blockIdx.x + threadIdx.x) * miniBlockSize;
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* block index */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (j >= blockNum)
if (i >= blockSize || j >= blockNum)
return;
/* target position */
int k = targetBlocks[j];
DTYPE * s = source + blockSize * j;
DTYPE * t = target + blockSize * k;
if (i < blockSize) {
if (miniBlockSize == 4) {
t[i] = s[i];
t[i + 1] = s[i + 1];
t[i + 2] = s[i + 2];
t[i + 3] = s[i + 3];
}
else if (miniBlockSize <= 1) {
t[i] = s[i];
}
else {
printf("something wrong!");
}
}
T * s = source + blockSize * j;
T * t = target + blockSize * targetBlocks[j];
t[i] = s[i];
}
/*
copy a number of blocks to target positions
NOTE that this version makes more use of the 2d threads in cuda
>> source - data array (head of the blocks) to copy from
>> blockSize - size of block
>> blockNum - number of blocks
>> target - target data array
>> targetBlocks - target positions of the copy
*/
template<class T>
__global__
void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, T * target, int * targetBlocks)
{
/* entry index in the block */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= totalSize)
return;
int targetBlockID = targetBlocks[i / blockSize];
int targetOffset = i % blockSize;
*(target + blockSize * targetBlockID + targetOffset) = source[i];
}
/*
......@@ -83,21 +92,30 @@ copy a number of blocks to target positions (cuda version)
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
{
CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
int cudaGrids[3];
int cudaBlocks[3];
int bSize = blockSize / sizeof(DTYPE);
if (bSize % 4 == 0) {
GDevs.GetCudaThread2D(devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
//GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
//KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((double*)source, bSize, blockNum, (double*)target, targetBlocks);
}
else {
else
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
//GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
//KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((float*)source, bSize, blockNum, (float*)target, targetBlocks);
}
else{
ShowNTErrors("Unsupported block size!");
}
}
#endif // USE_CUDA
......
......@@ -28,10 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy a number of blocks to target positions */
__global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论