Commit dcba416c by xiaotong

Protect the device context

parent e083e9f2
......@@ -223,8 +223,11 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN
int cudaGrids[3];
int cudaBlocks[3];
int threadNum = MIN(MAX(blockSize, blockNum), MAX_CUDA_THREAD_NUM_PER_BLOCK);
int devIDBackup;
ProtectCudaDev(myMem->devID, devIDBackup);
GDevs.GetCudaThread2D(myMem->devID, threadNum, gridNum * blockNum, INT_MAX, cudaGrids, cudaBlocks);
cudaBlocks[1] = 1;
......@@ -237,39 +240,41 @@ void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridN
if (blockNum == 4) {
if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
KernelCopyBlocksInGridFast<int, 4, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
else
KernelCopyBlocksInGridFast<int, 4, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
}
else if (blockNum == 6) {
if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
KernelCopyBlocksInGridFast<int, 6, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
else
KernelCopyBlocksInGridFast<int, 6, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
}
else if (blockNum == 8) {
if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
KernelCopyBlocksInGridFast<int, 8, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
else
KernelCopyBlocksInGridFast<int, 8, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
}
else if (blockNum == 12) {
if ((SHARED_MEMORY_SIZE / itemSize - 2 * MAX_CUDA_THREAD_NUM_PER_BLOCK) >= 2 * cudaBlocks[0] * blockNum)
KernelCopyBlocksInGridFast<int, 12, 2> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
else
KernelCopyBlocksInGridFast<int, 12, 1> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
}
else {
KernelCopyBlocksInGrid<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
((int*)source, blockSize, blockNum, gridNum, (int*)target, index);
}
BacktoCudaDev(myMem->devID, devIDBackup);
}
#endif // USE_CUDA
......
......@@ -95,28 +95,33 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target,
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
if(blockSize % sizeof(double) == 0){
int bSize = blockSize / sizeof(double);
//GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
//KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((double*)source, bSize, blockNum, (double*)target, targetBlocks);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<double> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((double*)source, bSize, blockNum, bSize * blockNum, (double*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<double> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((double*)source, bSize, blockNum, (double*)target, targetBlocks);
}
else
if(blockSize % sizeof(float) == 0){
int bSize = blockSize / sizeof(float);
//GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
//KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
// ((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
((float*)source, bSize, blockNum, (float*)target, targetBlocks);
GDevs.GetCudaThread(devID, bSize * blockNum, cudaGrids, cudaBlocks);
KernelCopyBlocksV2<float> <<<dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >>>
((float*)source, bSize, blockNum, bSize * blockNum, (float*)target, targetBlocks);
//GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
//KernelCopyBlocks<float> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
// ((float*)source, bSize, blockNum, (float*)target, targetBlocks);
}
else{
ShowNTErrors("Unsupported block size!");
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
......
......@@ -75,6 +75,9 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
/* copy the index to the GPU memory */
int * sourceBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
int * targetBlocksTMP = myMem != NULL ? (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)) : (int *)XMemAlloc(devID, blockNum * sizeof(int));
......@@ -97,6 +100,8 @@ void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, i
XMemFree(devID, sourceBlocksTMP);
XMemFree(devID, targetBlocksTMP);
}
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论