Commit 30dd9d30 by 张裕浩

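Fix reduceMax and reduceSum launch bugs: select the full reduction kernel when blocks.y >= 128 (was > 128), and round the LessBlocks grid size up to ceil(blockNum / 4) instead of truncating blockNum / 4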

parent 06e95a0a
...@@ -544,11 +544,13 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim) ...@@ -544,11 +544,13 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 grids; dim3 grids;
dim3 blocks; dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128) { if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum); KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
} }
else { else {
KernelReduceMaxOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum); if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
} }
} }
else { else {
......
...@@ -730,10 +730,13 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -730,10 +730,13 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 grids; dim3 grids;
dim3 blocks; dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128) if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp); KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else else {
KernelReduceSumOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp); if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
} }
else if (stride != 1 && stride * blockNum > 4096){ else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize); //GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论