Commit dcba416c by xiaotong

protect the device context

parent e083e9f2
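This commit wraps each of the CUDA block-copy routines below in a ProtectCudaDev / BacktoCudaDev pair, so the kernels are always launched on the device that owns the memory and the caller's active device is restored afterwards. The definitions of the two helpers are not part of this diff; the block below is only a minimal sketch of the save/switch/restore pattern their call sites imply, not the project's actual implementation.

/* Sketch only: the real ProtectCudaDev / BacktoCudaDev may differ in form and detail. */
#define ProtectCudaDev(devID, devIDBackup)                          \
{                                                                   \
    /* remember the device that is currently active */              \
    cudaGetDevice(&devIDBackup);                                    \
    /* switch to the device that owns the data, if necessary */     \
    if (devIDBackup != devID)                                       \
        cudaSetDevice(devID);                                       \
}

#define BacktoCudaDev(devID, devIDBackup)                           \
{                                                                   \
    /* restore the caller's device once the work has been issued */ \
    if (devIDBackup != devID)                                       \
        cudaSetDevice(devIDBackup);                                 \
}

With such a guard in place, a routine like _CudaCopyBlocks can be called while a different GPU is current without silently launching its kernels on the wrong device.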
@@ -223,8 +223,11 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN
     int cudaGrids[3];
     int cudaBlocks[3];
     int threadNum = MIN(MAX(blockSize, blockNum), MAX_CUDA_THREAD_NUM_PER_BLOCK);
+    int devIDBackup;
+    ProtectCudaDev(myMem->devID, devIDBackup);
+
     GDevs.GetCudaThread2D(myMem->devID, threadNum, gridNum * blockNum, INT_MAX, cudaGrids, cudaBlocks);
     cudaBlocks[1] = 1;
@@ -237,39 +240,41 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN
     if (blockNum == 4) {
         if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
             KernelCopyBlocksInGridFast<int, 4, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
         else
             KernelCopyBlocksInGridFast<int, 4, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
     }
     else if (blockNum == 6) {
         if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
             KernelCopyBlocksInGridFast<int, 6, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
         else
             KernelCopyBlocksInGridFast<int, 6, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
     }
     else if (blockNum == 8) {
         if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
             KernelCopyBlocksInGridFast<int, 8, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
         else
             KernelCopyBlocksInGridFast<int, 8, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                      ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
     }
     else if (blockNum == 12) {
         if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
             KernelCopyBlocksInGridFast<int, 12, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                       ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
         else
             KernelCopyBlocksInGridFast<int, 12, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                                       ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
     }
     else {
         KernelCopyBlocksInGrid<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                                        ((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
     }
+
+    BacktoCudaDev(myMem->devID, devIDBackup);
 }
 #endif // USE_CUDA
@@ -95,28 +95,33 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
     int cudaGrids[3];
     int cudaBlocks[3];
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
     if(blockSize % sizeof(double) == 0){
         int bSize = blockSize / sizeof(double);
-        //GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
-        //KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-        //                            ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
-        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
-                                  ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
+        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
+        KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+                                    ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
+        //GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        //KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        //                          ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
     }
     else
     if(blockSize % sizeof(float) == 0){
         int bSize = blockSize / sizeof(float);
-        //GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
-        //KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
-        //                           ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
-        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-        KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
-                                 ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
+        GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
+        KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
+                                   ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
+        //GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        //KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
+        //                         ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
     }
     else{
         ShowNTErrors("Unsupported block size!");
     }
+
+    BacktoCudaDev(devID, devIDBackup);
 }
 #endif // USE_CUDA
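Besides the device guard, the hunk above also switches _CudaCopyBlocks from the 2D KernelCopyBlocks launch to the flat 1D KernelCopyBlocksV2 launch. Neither kernel body appears in this diff; the sketch below only illustrates what a 1D copy kernel with the call signature used above could look like, and its body is an assumption rather than the project's code.

/* Sketch only: copy blockNum blocks of bSize items each; block j of the source
   is written to the block position given by targetBlocks[j] in the target. */
template<class T>
__global__
void KernelCopyBlocksV2Sketch(T * source, int bSize, int blockNum, int totalSize,
                              T * target, int * targetBlocks)
{
    /* one thread per copied item over the flattened source */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i >= totalSize)
        return;

    int j = i / bSize;          /* index of the source block       */
    int offset = i % bSize;     /* offset of the item in its block */

    target[targetBlocks[j] * bSize + offset] = source[i];
}

Launching it over bSize * blockNum threads, as the GetCudaThread call above arranges, gives one thread per element and removes the need for a second grid dimension.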
@@ -75,6 +75,9 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
     CheckNTErrors(devID >= 0, "Wrong device to run!");
     CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
     /* copy the index to the GPU memory */
     int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
     int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
@@ -97,6 +100,8 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
         XMemFree(devID, sourceBlocksTMP);
         XMemFree(devID, targetBlocksTMP);
     }
+
+    BacktoCudaDev(devID, devIDBackup);
 }
 #endif // USE_CUDA