Commit 30dd9d30 by 张裕浩

fix reduceMax and reduceSum bug

parent 06e95a0a
......@@ -544,11 +544,13 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128) {
if (blocks.y >= 128) {
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
}
else {
KernelReduceMaxOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
}
}
else {
......
......@@ -730,10 +730,13 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
dim3 grids;
dim3 blocks;
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
if (blocks.y > 128)
if (blocks.y >= 128)
KernelReduceSumOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
else
KernelReduceSumOpLessBlocks <<<blockNum / 4, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
else {
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
else blockNum = blockNum / 4;
KernelReduceSumOpLessBlocks << <blockNum, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论