Merge remote-tracking branch 'origin/liyinqiao' into xuchen

2d504e7a · xuchen · 003def3d · fa24d475 · 2d504e7a · 2d504e7a
Commit 2d504e7a authored Jul 11, 2018 by xuchen
--- a/source/core/movement/CopyBlocks.cpp
+++ b/source/core/movement/CopyBlocks.cpp
@@ -66,18 +66,19 @@ copy a number of blocks from source positions to target positions
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
 */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (myMem != NULL)
+        CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
+
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem);
+        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
 #else
        ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
    }
    else {
-        int devID = myMem != NULL ? myMem->devID : -1;
-
        /* 
        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use

--- a/source/core/movement/CopyBlocks.h
+++ b/source/core/movement/CopyBlocks.h
@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);

 /* copy a number of blocks from source positions to target positions */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/movement/CopyBlocksSelected.cu
+++ b/source/core/movement/CopyBlocksSelected.cu
@@ -70,28 +70,33 @@ copy a number of blocks from source positions to target positions (cuda version)
 >> targetBlocks - target positions of the copy
 >> myMem - memory pool
 */
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    CheckNTErrors((myMem != NULL), "No memory pool!");
-    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
+    CheckNTErrors((devID >= 0), "Wrong device to run!");
    CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");

    /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    XMemCopy(sourceBlocksTMP, myMem->devID, sourceBlocks, -1, blockNum * sizeof(int));
-    XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
+    int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
+    XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));

    int cudaGrids[3];
    int cudaBlocks[3];

-    GDevs.GetCudaThread2D(myMem->devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelCopyBlocksSelected << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                               ((DTYPE*)source, blockSize / sizeof(DTYPE), sourceBlocksTMP, blockNum, (DTYPE*)target, targetBlocksTMP);
    
+    if (myMem != NULL) {
        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
        myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        XMemFree(devID, sourceBlocksTMP);
+        XMemFree(devID, targetBlocksTMP);
+    }
 }

 #endif // USE_CUDA

--- a/source/core/movement/CopyBlocksSelected.cuh
+++ b/source/core/movement/CopyBlocksSelected.cuh
@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,

 /* copy a number of blocks form source positions to target positions (cuda version) */
 extern "C"
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);

 #endif // USE_CUDA


--- a/source/core/movement/CopyIndexed.cpp
+++ b/source/core/movement/CopyIndexed.cpp
@@ -84,7 +84,7 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
        CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
    }

-    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem);
+    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);

    delete[] realSrcIndex;
    delete[] realTgtIndex;

--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
@@ -31,7 +31,7 @@ bool Test()

    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
-    //wrong = !TestCopyIndexed() || wrong;
+    wrong = !TestCopyIndexed() || wrong;
    wrong = !TestCopyValues() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;