Commit ec71b1a9 by 张裕浩

Complete version of reduce

parent ce081078
@@ -28,6 +28,30 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
use inline PTX shuffle instructions to sum float data within a warp
*/
__device__ __forceinline__ float shfl_down_reduce_sum(float input)
{
    float output;
    /* each shfl.down.b32 reads the value held by the lane 'delta' positions
       higher in the warp; after the five halving steps lane 0 holds the sum
       of all 32 lanes. Note that this is the older, non-.sync form of the
       shuffle instruction. %1 is modified inside the asm block, so it is
       declared as a read-write ("+f") operand. */
    asm volatile(
        "{"
        ".reg .f32 r0;"
        "shfl.down.b32 r0, %1, 0x10, 0x1f;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x8, 0xf;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x4, 0x7;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x2, 0x3;"
        "add.f32 %1, r0, %1;"
        "shfl.down.b32 r0, %1, 0x1, 0x1;"
        "add.f32 %0, r0, %1;"
        "}"
        : "=f"(output), "+f"(input));
    return output;
}
/*
use inline PTX shuffle instructions to sum int data within a warp
*/
__device__ __forceinline__ int shfl_down_reduce_sum(int input)
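For reference, the warp-level sum above can also be written with CUDA's __shfl_down_sync intrinsic (CUDA 9 and later). The sketch below is only an illustrative equivalent and is not part of this commit:

/* illustrative only: warp-wide sum with intrinsics instead of inline PTX */
__device__ __forceinline__ float warpReduceSumIntrinsics(float val)
{
    /* 0xffffffff = all 32 lanes of the warp participate */
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset, 32);
    return val;    /* lane 0 now holds the sum of the whole warp */
}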
@@ -121,7 +145,6 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
__syncthreads();
}
/* write result for this block to the output array */
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
@@ -301,25 +324,49 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
        value2 = exp(value2);
    }

    /* load data into the shared mem */
    data[tid] = value + value2;
    value = value + value2;
    __syncthreads();

    /* unroll the warp */
    if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
    if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
    if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
    if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
    if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
    if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
    if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
    if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
    if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}

    /* write result for this block to the output array */
    if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];

    /* shuffle-based reduction that replaces the shared-memory unrolling above:
       stage 1, every warp reduces its own 32 values */
    value = shfl_down_reduce_sum(value);
    /*value += __shfl_down_sync(0x0000001F, value, 16, 32);
    value += __shfl_down_sync(0x0000001F, value, 8, 16);
    value += __shfl_down_sync(0x0000001F, value, 4, 8);
    value += __shfl_down_sync(0x0000001F, value, 2, 4);
    value += __shfl_down_sync(0x0000001F, value, 1, 2);*/

    /* lane 0 of each warp publishes its partial sum to shared memory */
    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
    __syncthreads();

    /* stage 2: the first warp reduces the per-warp partial sums and
       writes the block result to the output array */
    if (tid < 32)
    {
        if (tid < blockDim.y / 32)
            value = data[tid];
        else value = 0;
        value = shfl_down_reduce_sum(value);
        if (tid == 0 && blockIdx.y < reducedStrideNum)
            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
    }

    /*if (blockDim.y / 32 >= 16) { if (tid < 8) { data[tid] += data[tid + 8]; } __syncthreads(); }
    if (blockDim.y / 32 >= 8) { if (tid < 4) { data[tid] += data[tid + 4]; } __syncthreads(); }
    if (blockDim.y / 32 >= 4) { if (tid < 2) { data[tid] += data[tid + 2]; } __syncthreads(); }
    if (blockDim.y / 32 >= 2) { if (tid < 1) { data[tid] += data[tid + 1]; } __syncthreads(); }*/

    ///* load data into the shared mem */
    //data[tid] = value + value2;
    //__syncthreads();
    ///* unroll the warp */
    //if(goodSize >= 512) {if(tid < 256) {data[tid] += data[tid + 256];} __syncthreads();}
    //if(goodSize >= 256) {if(tid < 128) {data[tid] += data[tid + 128];} __syncthreads();}
    //if(goodSize >= 128) {if(tid < 64) {data[tid] += data[tid + 64];} __syncthreads();}
    //if(goodSize >= 64) {if(tid < 32) {data[tid] += data[tid + 32];} __syncthreads();}
    //if(goodSize >= 32) {if(tid < 16) {data[tid] += data[tid + 16];} __syncthreads();}
    //if(goodSize >= 16) {if(tid < 8) {data[tid] += data[tid + 8];} __syncthreads();}
    //if(goodSize >= 8) {if(tid < 4) {data[tid] += data[tid + 4];} __syncthreads();}
    //if(goodSize >= 4) {if(tid < 2) {data[tid] += data[tid + 2];} __syncthreads();}
    //if(goodSize >= 2) {if(tid < 1) {data[tid] += data[tid + 1];} __syncthreads();}
    ///* write result for this block to the output array */
    //if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
    //    output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = data[0];
}
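The rewritten body of KernelReduceSumFast above follows a two-stage pattern: every warp reduces its own values with shuffles, lane 0 of each warp stores its partial sum to shared memory, and the first warp then reduces those partials. A minimal, self-contained sketch of that idea (hypothetical names; it assumes a one-dimensional thread block whose size is a multiple of 32 and at most 1024 threads, unlike the 2-D blocks the kernel actually uses):

__device__ float blockReduceSumSketch(float val)
{
    __shared__ float warpSum[32];                 /* one slot per possible warp */
    int lane = threadIdx.x & 0x1f;                /* lane index within the warp */
    int warp = threadIdx.x >> 5;                  /* warp index within the block */

    /* stage 1: every warp reduces its 32 values */
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset, 32);
    if (lane == 0)
        warpSum[warp] = val;                      /* lane 0 publishes the warp sum */
    __syncthreads();

    /* stage 2: the first warp reduces the per-warp partial sums */
    if (warp == 0) {
        val = (lane < (int)(blockDim.x >> 5)) ? warpSum[lane] : 0.0F;
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffff, val, offset, 32);
    }
    return val;                                   /* thread 0 holds the block-wide sum */
}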
/*
@@ -723,6 +770,18 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
}
//printf("grad %d %d thread %d %d\n", grids.x, grids.y, blocks.x, blocks.y);
}
/* when the reduced dimension is not the last one (stride != 1) and there are
   enough independent columns, dispatch to the kernel that reduces each column
   directly, without shared memory (a simplified sketch of this idea appears
   after the last hunk below) */
else if (stride != 1 && stride * blockNum > 4096)
{
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
dim3 grid, block;
discontinuousStorageNoShareMemThreadAllocation(grid, block, stride, blockNum);
//printf("%d %d %d %d\n", cudaGridSize[0], cudaGridSize[1], cudaBlockSize[0], cudaBlockSize[1]);
KernelReduceSumDiscontinuousStorage << <grid, block >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, sp, power, isExp);
}
else
{
do {
@@ -742,7 +801,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
oData = buf1;
}
/* unroll the reduction procedure. The code is messy but it is faster. */
if (strideNum < 32) {
if (strideNum <= 32) {
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
if (cudaGridSize[0] == 1)
@@ -755,6 +814,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 256) {
@@ -763,6 +823,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else if (strideNum < 512) {
@@ -771,6 +832,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
else {
@@ -779,6 +841,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
if (cudaGridSize[0] == 1)
oData = (DTYPE*)output->data;
CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
adjustThreadForUseWarpOptimization(blocks, threads);
KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
}
}
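The new stride != 1 branch in _CudaReduceSum above dispatches to KernelReduceSumDiscontinuousStorage, whose body is not shown in this diff. The sketch below only illustrates the usual shape of such a kernel (one thread sums one column serially, no shared memory), with hypothetical names and without the power/isExp handling of the real kernel:

/* illustrative only: reduce the middle dimension of a [blockNum, strideNum, stride]
   tensor, one thread per output element */
__global__ void KernelColumnSumSketch(DTYPE * input, DTYPE * output,
                                      int stride, int strideNum, int blockNum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= stride * blockNum)
        return;
    int blockIndex = idx / stride;                /* which logical block */
    int offset = idx % stride;                    /* position inside the stride */

    DTYPE sum = 0;
    /* walk the reduced dimension with a step of 'stride' (discontinuous storage) */
    for (int i = 0; i < strideNum; i++)
        sum += input[(blockIndex * strideNum + i) * stride + offset];
    output[blockIndex * stride + offset] = sum;
}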