Commit 2d504e7a by xuchen

Merge remote-tracking branch 'origin/liyinqiao' into xuchen

parents 003def3d fa24d475
@@ -66,18 +66,19 @@ copy a number of blocks source source positions to target positions
 >> targetBlocks - target positions of the copy
 >> myMem - the memory pool
 */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    if (myMem != NULL && myMem->devID >= 0) {
+    if (myMem != NULL)
+        CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem);
+        CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
 #else
         ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
 #endif
     }
     else {
-        int devID = myMem != NULL ? myMem->devID : -1;
         /*
         The following code should be fine with GPUs, but too many
         kernel calls would slow down the system. We prefer to use
...
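With this change the caller chooses the device explicitly instead of relying on a memory pool being present. A minimal sketch of the two call patterns for the new signature, assuming the CopyBlocks declaration from the header below is in scope; the buffers, block size, and index arrays are illustrative only:

/* a minimal sketch, assuming the nts::CopyBlocks declaration is visible */
void CopyTwoBlocks(float * src, float * tgt, XMem * mem)
{
    int blockSize = 4 * sizeof(float);   /* block size is given in bytes */
    int srcIdx[2] = {0, 1};
    int tgtIdx[2] = {1, 0};

    /* pool-free call: devID = -1 takes the CPU branch */
    CopyBlocks(src, blockSize, srcIdx, 2, tgt, tgtIdx, NULL, -1);

    /* pooled call: devID must match mem->devID, or the new CheckNTErrors fires */
    if (mem != NULL)
        CopyBlocks(src, blockSize, srcIdx, 2, tgt, tgtIdx, mem, mem->devID);
}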
@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
 /* copy a number of blocks from source positions to target positions */
-void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
 } // namespace nts(NiuTrans.Tensor)
...
@@ -70,28 +70,33 @@ copy a number of blocks from source positions to target positions (cuda version)
 >> targetBlocks - target positions of the copy
 >> myMem - memory pool
 */
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem)
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
 {
-    CheckNTErrors((myMem != NULL), "No memory pool!");
-    CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
+    CheckNTErrors((devID >= 0), "Wrong device to run!");
     CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
     /* copy the index to the GPU memory */
-    int * sourceBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
-    XMemCopy(sourceBlocksTMP, myMem->devID, sourceBlocks, -1, blockNum * sizeof(int));
-    XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
+    int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
+    XMemCopy(sourceBlocksTMP, devID, sourceBlocks, -1, blockNum * sizeof(int));
+    XMemCopy(targetBlocksTMP, devID, targetBlocks, -1, blockNum * sizeof(int));
     int cudaGrids[3];
     int cudaBlocks[3];
-    GDevs.GetCudaThread2D(myMem->devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(devID, blockSize / sizeof(DTYPE), blockNum, MAX_INT, cudaGrids, cudaBlocks);
     KernelCopyBlocksSelected << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                              ((DTYPE*)source, blockSize / sizeof(DTYPE), sourceBlocksTMP, blockNum, (DTYPE*)target, targetBlocksTMP);
+    if (myMem != NULL) {
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
         myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
+    }
+    else {
+        XMemFree(devID, sourceBlocksTMP);
+        XMemFree(devID, targetBlocksTMP);
+    }
 }
 #endif // USE_CUDA
...
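The CUDA path now works with or without a pool: the temporary index buffers come from myMem->AllocBuf when a pool is supplied and from XMemAlloc otherwise, and are released through the matching ReleaseBuf/XMemFree call. A compact sketch of that allocate/release pairing, using only the calls that appear in the diff; the helper names are hypothetical:

/* hypothetical helpers mirroring the allocate/release strategy above */
int * NewDeviceIndexBuffer(XMem * myMem, int devID, int * hostIndex, int num)
{
    int * buf = myMem != NULL ?
                (int*)myMem->AllocBuf(myMem->devID, num * sizeof(int)) :
                (int*)XMemAlloc(devID, num * sizeof(int));
    /* srcDevID = -1 marks the index array as host memory */
    XMemCopy(buf, devID, hostIndex, -1, num * sizeof(int));
    return buf;
}

void FreeDeviceIndexBuffer(XMem * myMem, int devID, int * buf, int num)
{
    if (myMem != NULL)
        myMem->ReleaseBuf(myMem->devID, num * sizeof(int));  /* pool buffers are returned by size */
    else
        XMemFree(devID, buf);
}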
@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,
 /* copy a number of blocks form source positions to target positions (cuda version) */
 extern "C"
-void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem);
+void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
 #endif // USE_CUDA
...
@@ -84,7 +84,7 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
         CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
     }
-    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem);
+    CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);
     delete[] realSrcIndex;
     delete[] realTgtIndex;
...
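At this call site the tensor itself now supplies both the pool (which may be NULL) and the device id, so CopyIndexed no longer requires a tensor allocated from a pool. A sketch of the same pattern for a direct block copy between two tensors, assuming only the members used in the diff (data, unitSize, mem, devID); the row size and index arrays are illustrative:

/* illustrative: copy selected rows of s into t, with or without a memory pool */
void CopyRowsSketch(XTensor * s, XTensor * t, int * srcRows, int * tgtRows, int rowNum, int rowSize)
{
    int blockSize = rowSize * s->unitSize;   /* one block = one row, in bytes */

    /* s->mem may be NULL; s->devID alone picks the device */
    CopyBlocks(s->data, blockSize, srcRows, rowNum, t->data, tgtRows, s->mem, s->devID);
}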
@@ -31,7 +31,7 @@ bool Test()
     wrong = !TestConcatenate() || wrong;
     wrong = !TestConcatenateSolely() || wrong;
-    //wrong = !TestCopyIndexed() || wrong;
+    wrong = !TestCopyIndexed() || wrong;
     wrong = !TestCopyValues() || wrong;
     wrong = !TestMatrixMul() || wrong;
     wrong = !TestMatrixMul2D() || wrong;
...