Commit ec71b1a9 by 张裕浩

Complete version of reduce

parent ce081078
@@ -28,6 +28,30 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use inline PTX shuffle instructions to sum float data within a warp
*/
__device__ __forceinline__ float shfl_down_reduce_sum(float input)
{
    float output;
    /* each shfl.down.b32 reads the value held by the lane 'delta' positions
       higher in the warp; after the five halving steps lane 0 holds the sum
       of all 32 lanes. Note that this is the older, non-.sync form of the
       shuffle instruction. %1 is modified inside the asm block, so it is
       declared as a read-write ("+f") operand. */
    asm volatile(
        "{"
        ".reg .f32 r0;"
        "shfl.down.b32 r0, %1, 0x10, 0x1f;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x8, 0xf;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x4, 0x7;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x2, 0x3;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x1, 0x1;"
        "add.f32 %0, r0, %1;"
        "}"
        : "=f"(output), "+f"(input));
    return output;
}
/*
use inline PTX shuffle instructions to sum int data within a warp
*/
__device__ __forceinline__ int shfl_down_reduce_sum(int input)
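For reference, the warp-level sum above can also be written with CUDA's __shfl_down_sync intrinsic (CUDA 9 and later). The sketch below is only an illustrative equivalent and is not part of this commit:

/* illustrative only: warp-wide sum with intrinsics instead of inline PTX */
__device__ __forceinline__ float warpReduceSumIntrinsics(float val)
{
    /* 0xffffffff = all 32 lanes of the warp participate */
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset, 32);
    return val;    /* lane 0 now holds the sum of the whole warp */
}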
@@ -121,7 +145,6 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__syncthreads();
}
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
@@ -301,25 +324,49 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
        value2 = exp(value2);
    }

    /* load data into the shared mem */
    data[tid] = value + value2;
    value = value + value2;
    __syncthreads();

    /* unroll the warp */
    if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
    if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
    if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
    if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
    if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
    if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
    if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
    if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
    if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}

    /* write result for this block to the output array */
    if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];

    /* shuffle-based reduction that replaces the shared-memory unrolling above:
       stage 1, every warp reduces its own 32 values */
    value = shfl_down_reduce_sum(value);
    /*value += __shfl_down_sync(0x0000001F, value, 16, 32);
    value += __shfl_down_sync(0x0000001F, value, 8, 16);
    value += __shfl_down_sync(0x0000001F, value, 4, 8);
    value += __shfl_down_sync(0x0000001F, value, 2, 4);
    value += __shfl_down_sync(0x0000001F, value, 1, 2);*/

    /* lane 0 of each warp publishes its partial sum to shared memory */
    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
    __syncthreads();

    /* stage 2: the first warp reduces the per-warp partial sums and
       writes the block result to the output array */
    if (tid < 32)
    {
        if (tid < blockDim.y / 32)
            value = data[tid];
        else value = 0;
        value = shfl_down_reduce_sum(value);
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
    }

    /*if (blockDim.y / 32 >= 16) { if (tid < 8) { data[tid] += data[tid + 8]; } __syncthreads(); }
    if (blockDim.y / 32 >= 8) { if (tid < 4) { data[tid] += data[tid + 4]; } __syncthreads(); }
    if (blockDim.y / 32 >= 4) { if (tid < 2) { data[tid] += data[tid + 2]; } __syncthreads(); }
    if (blockDim.y / 32 >= 2) { if (tid < 1) { data[tid] += data[tid + 1]; } __syncthreads(); }*/

    ///* load data into the shared mem */
    //data[tid] = value + value2;
    //__syncthreads();
    ///* unroll the warp */
    //if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
    //if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
    //if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
    //if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
    //if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
    //if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
    //if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
    //if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
    //if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
    ///* write result for this block to the output array */
    //if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
    //    output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
}
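The rewritten body of KernelReduceSumFast above follows a two-stage pattern: every warp reduces its own values with shuffles, lane 0 of each warp stores its partial sum to shared memory, and the first warp then reduces those partials. A minimal, self-contained sketch of that idea (hypothetical names; it assumes a one-dimensional thread block whose size is a multiple of 32 and at most 1024 threads, unlike the 2-D blocks the kernel actually uses):

__device__ float blockReduceSumSketch(float val)
{
    __shared__ float warpSum[32];                 /* one slot per possible warp */
    int lane = threadIdx.x & 0x1f;                /* lane index within the warp */
    int warp = threadIdx.x >> 5;                  /* warp index within the block */

    /* stage 1: every warp reduces its 32 values */
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset, 32);
    if (lane == 0)
        warpSum[warp] = val;                      /* lane 0 publishes the warp sum */
    __syncthreads();

    /* stage 2: the first warp reduces the per-warp partial sums */
    if (warp == 0) {
        val = (lane < (int)(blockDim.x >> 5)) ? warpSum[lane] : 0.0F;
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset, 32);
    }
    return val;                                   /* thread 0 holds the block-wide sum */
}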
/*
@@ -723,6 +770,18 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
//printf("grad %d %d thread %d %d\n", grids.x, grids.y, blocks.x, blocks.y);
}
/* when the reduced dimension is not the last one (stride != 1) and there are
   enough independent columns, dispatch to the kernel that reduces each column
   directly, without shared memory (a simplified sketch of this idea appears
   after the last hunk below) */
else if (stride != 1 && stride * blockNum > 4096)
{
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelReduceSumDiscontinuousStorage << <grid, block >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
}
else
{
do {
@@ -742,7 +801,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
@@ -755,6 +814,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
@@ -763,6 +823,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
@@ -771,6 +832,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else {
@@ -779,6 +841,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
}
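The new stride != 1 branch in _CudaReduceSum above dispatches to KernelReduceSumDiscontinuousStorage, whose body is not shown in this diff. The sketch below only illustrates the usual shape of such a kernel (one thread sums one column serially, no shared memory), with hypothetical names and without the power/isExp handling of the real kernel:

/* illustrative only: reduce the middle dimension of a [blockNum, strideNum, stride]
   tensor, one thread per output element */
__global__ void KernelColumnSumSketch(DTYPE * input, DTYPE * output,
                                      int stride, int strideNum, int blockNum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= stride * blockNum)
        return;
    int blockIndex = idx / stride;                /* which logical block */
    int offset = idx % stride;                    /* position inside the stride */

    DTYPE sum = 0;
    /* walk the reduced dimension with a step of 'stride' (discontinuous storage) */
    for (int i = 0; i < strideNum; i++)
        sum += input[(blockIndex * strideNum + i) * stride + offset];
    output[blockIndex * stride + offset] = sum;
}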