Add devID as an explicit argument to the block-copy functions (_CopyBlocks, _CopyBlocksOnSite, _CudaCopyBlocks) so that they no longer require an XMem memory pool as a necessary input

15f75d3a · xiaotong · 3f23f074 · 15f75d3a · 15f75d3a · 15f75d3a
Commit 15f75d3a authored Jul 27, 2018 by xiaotong
--- a/source/tensor/core/movement/CopyBlocks.cpp
+++ b/source/tensor/core/movement/CopyBlocks.cpp
@@ -35,24 +35,33 @@ copy a number of blocks to target positions
 >> target - target data array
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
+>> devID - device id
 */
-void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (myMem != NULL)
+        devID = myMem->devID;
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        /* copy the index from host to device */
+        /* copy the index from host to device; devID equals myMem->devID whenever a pool is given, so myMem itself may be NULL here */
-        int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
+        int * targetBlocksTMP = myMem != NULL ?
+                               (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)):
+                               (int*)XMemAlloc(devID, blockNum * sizeof(int));
-        XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
+        XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
-        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
+        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, devID);
-        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        if(myMem != NULL)
+            myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+        else
+            XMemFree(devID, targetBlocksTMP);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
+        _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, devID);
    }
 }
@@ -65,11 +74,12 @@ copy a number of blocks source source positions to target positions
 >> target - target data array
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
+>> devID - device id
 */
 void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
    if (myMem != NULL)
-        CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
+        devID = myMem->devID;
    if (devID >= 0) {
 #ifdef USE_CUDA

--- a/source/tensor/core/movement/CopyBlocks.h
+++ b/source/tensor/core/movement/CopyBlocks.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy a number of blocks to target positions */
-void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
 /* copy a number of blocks from source positions to target positions */
 void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

--- a/source/tensor/core/movement/CopyBlocksOnSite.cpp
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cpp
@@ -34,20 +34,18 @@ all the data has been on the device (CPU/GPU) already.
 >> blockNum - number of blocks
 >> target - target data array
 >> targetBlocks - target positions of the copy
->> myMem - the memory pool
+>> devID - device id
 */
-void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
+        _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, devID);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        int devID = myMem != NULL ? myMem->devID : -1;
        /* 
        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
@@ -55,8 +53,8 @@ void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target
        */
        for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
-                (char*)source + b, devID, blockSize);
+                     (char*)source + b, devID, blockSize);
        }
    }
 }
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/CopyBlocksOnSite.cu
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cu
@@ -78,13 +78,12 @@ copy a number of blocks to target positions (cuda version)
 >> blockNum - number of blocks
 >> target - target data array
 >> targetBlocks - target positions of the copy (on the device)
->> myMem - memory pool
+>> devID - device id
 */
-void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID)
 {
-    CheckNTErrors((myMem != NULL), "No memory pool!");
+    CheckNTErrors(devID >= 0, "Wrong device to run!");
-    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
+    CheckNTErrors(blockSize % sizeof(DTYPE) == 0, "Unsupported block size!");
-    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
    int cudaGrids[3];
    int cudaBlocks[3];
@@ -92,15 +91,15 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, 
    if (bSize % 4 == 0) {
-        GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        GDevs.GetCudaThread2D(devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<4> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+        KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
-            ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
    }
    else {
-        GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+        KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
-            ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
+                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
    }
 }
 #endif // USE_CUDA
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/movement/CopyBlocksOnSite.cuh
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cuh
@@ -33,10 +33,10 @@ __global__
 void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
 /* copy a number of blocks to target positions (cuda version) */
-void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
 #endif // USE_CUDA
 } // namespace nts(NiuTrans.Tensor)
 #endif // __COPYBLOCKS_CUH__
\ No newline at end of file
--- a/source/tensor/core/movement/CopyBlocksOnSite.h
+++ b/source/tensor/core/movement/CopyBlocksOnSite.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* copy a number of blocks to target positions (on site) */
-void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, int devID);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
@@ -126,7 +126,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
-        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
+        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));

--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
@@ -138,7 +138,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
-        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
+        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));