speed-up data copy

e84e725e · xiaotong · f7ed3448 · e84e725e · e84e725e · e84e725e
Commit e84e725e authored Jul 28, 2018 by xiaotong
--- a/source/tensor/core/movement/CopyBlocksOnSite.cpp
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cpp
@@ -51,10 +51,18 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
        kernel calls would slow down the system. We prefer to use
        one kernel to do block copy in batch (kernel fusion). 
        */
+        if(blockSize == sizeof(int)){
+            for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
+                *(int*)((char*)target + targetBlocks[i] * blockSize) = 
+                *(int*)((char*)source + b);
+            }
+        }
+        else{
            for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
                XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                         (char*)source + b, devID, blockSize);
            }
        }
+    }
 }
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/movement/CopyBlocksOnSite.cu
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cu
@@ -36,39 +36,48 @@ NOTE that this version makes more use of the 2d threads in cuda
 >> target - target data array
 >> targetBlocks - target positions of the copy
 */
-template<int miniBlockSize>
+template<class T>
 __global__
-void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks)
+void KernelCopyBlocks(T * source, int blockSize, int blockNum, T * target, int * targetBlocks)
 {
    /* entry index in the block */
-    int i = (blockDim.x * blockIdx.x + threadIdx.x) * miniBlockSize;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
    /* block index */
    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    if (j >= blockNum)
+    if (i >= blockSize || j >= blockNum)
        return;
-    /* target position */
+    T * s = source + blockSize * j;
-    int k = targetBlocks[j];
+    T * t = target + blockSize * targetBlocks[j];
-    DTYPE * s = source + blockSize * j;
-    DTYPE * t = target + blockSize * k;
-    if (i < blockSize) {
-        if (miniBlockSize == 4) {
-            t[i] = s[i];
-            t[i + 1] = s[i + 1];
-            t[i + 2] = s[i + 2];
-            t[i + 3] = s[i + 3];
-        }
-        else if (miniBlockSize <= 1) {
    t[i] = s[i];
-        }
+}
-        else {
-            printf("something wrong!");
+/*
-        }
+copy a number of blocks to target positions
-    }
+NOTE that this version makes more use of the 2d threads in cuda
+>> source - data array (head of the blocks) to copy from
+>> blockSize - size of block
+>> blockNum - number of blocks
+>> target - target data array
+>> targetBlocks - target positions of the copy
+*/
+template<class T>
+__global__
+void KernelCopyBlocksV2(T * source, int blockSize, int blockNum, int totalSize, T * target, int * targetBlocks)
+{
+    /* entry index in the block */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= totalSize)
+        return;
+    int targetBlockID = targetBlocks[i / blockSize];
+    int targetOffset  = i % blockSize;
+    *(target + blockSize * targetBlockID + targetOffset) = source[i];
 }
 /*
@@ -83,21 +92,30 @@ copy a number of blocks to target positions (cuda version)
 void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
 {
    CheckNTErrors(devID >= 0, "Wrong device to run!");
-    CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
    int cudaGrids[3];
    int cudaBlocks[3];
-    int bSize = blockSize / sizeof(DTYPE);
-    if (bSize % 4 == 0) {
+    if(blockSize % sizeof(double) == 0){
-        GDevs.GetCudaThread2D(devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        int bSize = blockSize / sizeof(double);
-        KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        //GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
-                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+        //KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+        //                            ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
+        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+                                    ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
    }
-    else {
+    else 
+    if(blockSize % sizeof(float) == 0){
+        int bSize = blockSize / sizeof(float);
+        //GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
+        //KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+        //                           ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
-                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+                                 ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
+    }
+    else{
+        ShowNTErrors("Unsupported block size!");
    }
 }
 #endif // USE_CUDA

--- a/source/tensor/core/movement/CopyBlocksOnSite.cuh
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cuh
@@ -28,10 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
-/* copy a number of blocks to target positions */
-__global__
-void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
 /* copy a number of blocks to target positions (cuda version) */
 void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);