Commit ec71b1a9 by 张裕浩

Complete version of reduce

parent ce081078
@@ -28,6 +28,30 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use PTX code to reduce float data
*/
__device__ __forceinline__ float shfl_down_reduce_sum(float input)
{
float output;
asm volatile(
"{"
".reg .f32 r0;"
"shfl.down.b32 r0, %1, 0x10, 0x1f;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x8, 0xf;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x4, 0x7;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x2, 0x3;"
"add.f32 %1, r0, %1;"
"shfl.down.b32 r0, %1, 0x1, 0x1;"
"add.f32 %0, r0, %1;"
"}"
: "=f"(output) : "f"(input));
return output;
}
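On architectures where the __shfl_down_sync intrinsics are available, the same warp-level sum can be written without inline PTX. A minimal sketch for comparison (not part of this commit; names are illustrative, and it uses the full-warp mask 0xffffffff rather than the narrowing masks of the PTX version):

/* warp-level sum via shuffle intrinsics; after the loop, lane 0 of the
   warp holds the sum of all 32 lanes */
__device__ __forceinline__ float warpReduceSumSketch(float val)
{
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

The commented-out __shfl_down_sync calls further down in this commit show the same intrinsic-based variant being tried inside the kernel itself.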
/*
use PTX code to reduce int data
*/
__device__ __forceinline__ int shfl_down_reduce_sum(int input)
@@ -121,7 +145,6 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__syncthreads();
}
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
@@ -301,25 +324,49 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
value2 = exp(value2);
}
-/* load data into the shared mem */
-data[tid] = value + value2;
+value = value + value2;
__syncthreads();
-/* unroll the warp */
-if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
-if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
-if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
-if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
-if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
-if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
-if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
-if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
-if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
-/* write result for this block to the output array */
-if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
-    output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
+value = shfl_down_reduce_sum(value);
+/*value += __shfl_down_sync(0x0000001F, value, 16, 32);
+value += __shfl_down_sync(0x0000001F, value, 8, 16);
+value += __shfl_down_sync(0x0000001F, value, 4, 8);
+value += __shfl_down_sync(0x0000001F, value, 2, 4);
+value += __shfl_down_sync(0x0000001F, value, 1, 2);*/
+if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
+__syncthreads();
+if (tid < 32)
+{
+    if (tid < blockDim.y / 32)
+        value = data[tid];
+    else value = 0;
+    value = shfl_down_reduce_sum(value);
+    if (tid == 0 && blockIdx.y < reducedStrideNum)
+        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
+}
+/*if (blockDim.y / 32 >= 16) { if (tid < 8) { data[tid] += data[tid + 8]; } __syncthreads(); }
+if (blockDim.y / 32 >= 8) { if (tid < 4) { data[tid] += data[tid + 4]; } __syncthreads(); }
+if (blockDim.y / 32 >= 4) { if (tid < 2) { data[tid] += data[tid + 2]; } __syncthreads(); }
+if (blockDim.y / 32 >= 2) { if (tid < 1) { data[tid] += data[tid + 1]; } __syncthreads(); }*/
+///* load data into the shared mem */
+//data[tid] = value + value2;
+//__syncthreads();
+///* unroll the warp */
+//if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
+//if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
+//if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
+//if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
+//if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
+//if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
+//if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
+//if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
+//if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
+///* write result for this block to the output array */
+//if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
+//    output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
}
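The rewritten body of KernelReduceSumFast above follows a two-level pattern: each warp reduces its values with shfl_down_reduce_sum, lane 0 of every warp parks the partial sum in shared memory (data[tid / 32]), and the first warp then reduces those per-warp partials. A minimal self-contained sketch of that pattern, reduced to a 1-D block (the name blockReduceSumSketch, the use of __shfl_down_sync instead of the PTX helper, and the assumption that blockDim.x is a multiple of 32 and at most 1024 are mine, not the commit's):

__device__ float blockReduceSumSketch(float val)
{
    __shared__ float warpSums[32];      /* one slot per warp, 1024 threads max */
    int lane = threadIdx.x & 0x1f;      /* lane index within the warp  */
    int warp = threadIdx.x >> 5;        /* warp index within the block */

    /* level 1: every warp reduces its own 32 values */
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    if (lane == 0)
        warpSums[warp] = val;
    __syncthreads();

    /* level 2: the first warp reduces the per-warp partial sums */
    if (warp == 0) {
        val = (lane < blockDim.x / 32) ? warpSums[lane] : 0.0f;
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;                          /* thread 0 holds the block-wide sum */
}

This mirrors the (tid & 0x1f) == 0 and if (tid < 32) structure in the kernel above, which additionally has to respect the 2-D (threadIdx.x, threadIdx.y) layout used for the stride dimension.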
/* /*
@@ -723,6 +770,18 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
//printf("grad %d %d thread %d %d\n", grids.x, grids.y, blocks.x, blocks.y);
}
else if (stride != 1 && stride * blockNum > 4096)
{
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelReduceSumDiscontinuousStorage << <grid, block >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
}
else
{
do {
@@ -742,7 +801,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
-if (strideNum < 32) {
+if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
@@ -755,6 +814,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
@@ -763,6 +823,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
@@ -771,6 +832,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else {
@@ -779,6 +841,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
}
...
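The new stride != 1 && stride * blockNum > 4096 branch above dispatches to KernelReduceSumDiscontinuousStorage, whose body is not part of this hunk. Judging from the call site, the idea is to skip shared memory entirely when there are many independent columns: one thread owns one output element and walks its column in registers. A hypothetical sketch of that idea (kernel name, launch shape, and the omission of the power/isExp handling are my assumptions, not the library's kernel):

/* one thread per (block, stride-offset) output element: it strides through
   the strideNum values of its column and accumulates them in a register */
__global__ void reduceSumDiscontinuousSketch(const float * input, float * output,
                                             int stride, int strideNum, int blockNum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;    /* global output index   */
    if (idx >= stride * blockNum)
        return;

    int blockIndex = idx / stride;                      /* which reduction block */
    int offset = idx % stride;                          /* offset inside it      */
    const float * ip = input + blockIndex * stride * strideNum + offset;

    float sum = 0.0f;
    for (int i = 0; i < strideNum; i++)                 /* walk the column       */
        sum += ip[i * stride];
    output[idx] = sum;
}

With stride * blockNum > 4096 there are enough output elements to keep the device busy with such independent threads, which is presumably why this path avoids the shared-memory kernels.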