添加reduce操作

ce081078 · 张裕浩 · d6d35fab · ce081078 · ce081078
Commit ce081078 authored Aug 03, 2018 by 张裕浩
--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -29,6 +29,38 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

+
+/*
+warp-level max reduction of float data using register shuffles.
+This replaces the former hand-written PTX, which wrote to an asm operand that
+was declared input-only and used the non-sync form of "shfl.down".
+All 32 lanes of the warp must call this function together, because the shuffle
+is issued with a full participation mask.
+Lane 0 ends up with the max over the whole warp; the other lanes only hold
+partial results, and the callers visible in this file read lane 0 only.
+Requires the __shfl_down_sync intrinsic (CUDA 9.0 or later).
+input  - the value contributed by the calling lane
+return - on lane 0, the max of the 32 contributed values
+*/
+__device__ __forceinline__  float shfl_down_reduce_max(float input)
+{
+    float output = input;
+
+    /* halve the shuffle distance at every step: 16, 8, 4, 2, 1 */
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        float other = __shfl_down_sync(0xFFFFFFFF, output, offset);
+
+        /* same test as the former "setp.lt.f32": the current value is only
+           replaced when it is strictly smaller than the incoming one */
+        if (output < other)
+            output = other;
+    }
+
+    return output;
+}
+
 /* 
 reduce a tensor to another that keeps the max value along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
@@ -191,25 +223,40 @@ void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
    DTYPE value  = j < strideNum ? inputData[j * stride + iOffset]: FLOAT_MIN;
    DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;

-    /* load data into the shared mem */
-    data[tid] = MAX(value, value2);
-
+    value = MAX(value, value2);
+    value = shfl_down_reduce_max(value);
+    if ((tid & 0x1f) == 0) { data[tid / 32] = value; }
    __syncthreads();

-    /* unroll the warp */
-    if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();}
-    if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();}
-    if(goodSize >= 128) {if(tid <  64) {if(data[tid] < data[tid +  64]) data[tid] = data[tid +  64];} __syncthreads();}
-    if(goodSize >=  64) {if(tid <  32) {if(data[tid] < data[tid +  32]) data[tid] = data[tid +  32];} __syncthreads();}
-    if(goodSize >=  32) {if(tid <  16) {if(data[tid] < data[tid +  16]) data[tid] = data[tid +  16];} __syncthreads();}
-    if(goodSize >=  16) {if(tid <   8) {if(data[tid] < data[tid +   8]) data[tid] = data[tid +   8];} __syncthreads();}
-    if(goodSize >=   8) {if(tid <   4) {if(data[tid] < data[tid +   4]) data[tid] = data[tid +   4];} __syncthreads();}
-    if(goodSize >=   4) {if(tid <   2) {if(data[tid] < data[tid +   2]) data[tid] = data[tid +   2];} __syncthreads();}
-    if(goodSize >=   2) {if(tid <   1) {if(data[tid] < data[tid +   1]) data[tid] = data[tid +   1];} __syncthreads();}
+    if (tid < 32)
+    {
+        if (tid < blockDim.y / 32)
+            value = data[tid];
+        else value = FLOAT_MIN;
+        value = shfl_down_reduce_max(value);
+        if (tid == 0 && blockIdx.y < reducedStrideNum)
+            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
+    }

-    /* write result for this block to the output array */
-    if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
-        output[(k * reducedStrideNum + blockIdx.y) * stride  + iOffset] = data[0];
+    ///* load data into the shared mem */
+    //data[tid] = MAX(value, value2);
+
+    //__syncthreads();
+
+    ///* unroll the warp */
+    //if(goodSize >= 512) {if(tid < 256) {if(data[tid] < data[tid + 256]) data[tid] = data[tid + 256];} __syncthreads();}
+    //if(goodSize >= 256) {if(tid < 128) {if(data[tid] < data[tid + 128]) data[tid] = data[tid + 128];} __syncthreads();}
+    //if(goodSize >= 128) {if(tid <  64) {if(data[tid] < data[tid +  64]) data[tid] = data[tid +  64];} __syncthreads();}
+    //if(goodSize >=  64) {if(tid <  32) {if(data[tid] < data[tid +  32]) data[tid] = data[tid +  32];} __syncthreads();}
+    //if(goodSize >=  32) {if(tid <  16) {if(data[tid] < data[tid +  16]) data[tid] = data[tid +  16];} __syncthreads();}
+    //if(goodSize >=  16) {if(tid <   8) {if(data[tid] < data[tid +   8]) data[tid] = data[tid +   8];} __syncthreads();}
+    //if(goodSize >=   8) {if(tid <   4) {if(data[tid] < data[tid +   4]) data[tid] = data[tid +   4];} __syncthreads();}
+    //if(goodSize >=   4) {if(tid <   2) {if(data[tid] < data[tid +   2]) data[tid] = data[tid +   2];} __syncthreads();}
+    //if(goodSize >=   2) {if(tid <   1) {if(data[tid] < data[tid +   1]) data[tid] = data[tid +   1];} __syncthreads();}
+
+    ///* write result for this block to the output array */
+    //if(threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
+    //    output[(k * reducedStrideNum + blockIdx.y) * stride  + iOffset] = data[0];
 }

 /*
@@ -326,6 +373,105 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
    op[offset] = max;
 }

+/*
+choose the launch configuration for reducing vectors that are stored contiguously:
+one thread block per vector, with the block laid out along y.
+The fewer vectors there are, the more warps each block gets (4, 8, 16 or 32),
+but never more warps than are needed to cover one vector.
+grid       - receives the grid size (vectorNum, 1, 1)
+block      - receives the block size (1, warpNum * 32, 1)
+vectorNum  - number of vectors to reduce
+vectorSize - number of items in each vector
+*/
+inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
+{
+    /* NOTE(review): 20 is a hard-coded tuning constant, presumably the SM count
+       of the target GPU - confirm before changing it */
+    int warpNum = 4;
+    if (vectorNum < 20 * 8)
+    {
+        warpNum = 8;
+        if (vectorNum < 20 * 4)
+        {
+            warpNum = 16;
+            /* this used to test warpNum (just set to 16), which was always true
+               and made the 16-warp configuration unreachable */
+            if (vectorNum < 20 * 2)
+                warpNum = 32;
+        }
+    }
+
+    /* number of warps needed to cover one vector, rounded up */
+    int minWarpNum = vectorSize / 32;
+    if (vectorSize % 32 != 0) minWarpNum++;
+    warpNum = min(warpNum, minWarpNum);
+
+    grid.x = vectorNum;
+    grid.y = 1;
+    grid.z = 1;
+    block.x = 1;
+    block.y = warpNum * 32;
+    block.z = 1;
+}
+
+/*
+reshape a launch configuration for the warp-shuffle based kernels:
+the x extent of the thread block is moved into the grid so that the block is
+one thread wide, and the y extent of the block is raised to at least 32.
+blocks  - grid size, modified in place
+threads - block size, modified in place
+*/
+inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
+{
+    const bool wideBlock = threads.x > 1;
+    if (wideBlock) {
+        /* keep the total number of threads along x unchanged */
+        blocks.x = blocks.x * threads.x;
+        threads.x = 1;
+    }
+
+    /* never launch less than 32 threads along y */
+    threads.y = (threads.y < 32) ? 32 : threads.y;
+}
+
+/*
+reduce each contiguous vector of the input to its max value, one warp per vector.
+Each group of 32 consecutive threads along x handles one vector: every lane scans
+the vector with a step of 32, then the lanes are combined with warp shuffles and
+lane 0 writes the result.
+NOTE(review): assumes blockDim.x is a multiple of 32 so that a vector never
+straddles two warps - confirm against the launch site.
+input     - the input data, vector idy starts at idy * strideNum
+output    - receives one value per vector
+strideNum - number of items in each vector
+blockNum  - number of vectors
+*/
+__global__
+void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output,
+    int strideNum, int blockNum)
+{
+    int idx = threadIdx.x % 32;
+    int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+
+    /* warps beyond the last vector have nothing to do. The test is the same for
+       all lanes of a warp, so the shuffle below is still executed by full warps.
+       This makes it safe to round the number of launched blocks up. */
+    if (idy >= blockNum)
+        return;
+
+    /* 64-bit offset: idy * strideNum can exceed the range of int */
+    long long startIndex = (long long)idy * strideNum;
+    DTYPE threadMax = FLOAT_MIN;
+    for (int i = idx; i < strideNum; i += 32)
+    {
+        threadMax = max(input[startIndex + i], threadMax);
+    }
+
+    /* combine the 32 per-lane results, lane 0 holds the max of the vector */
+    threadMax = shfl_down_reduce_max(threadMax);
+    if (idx == 0)
+        output[idy] = threadMax;
+}
+
+/*
+reduce along a dimension and keep the max value, using warp shuffles.
+Threads along y walk over the reduced dimension; every warp reduces its values
+with shuffles, the per-warp results are exchanged through shared memory, and the
+first warp reduces them to the value written by thread 0.
+NOTE(review): the shared buffer holds one slot per warp and is offset by
+threadIdx.x * blockDim.y, so this looks like it expects blockDim.x == 1 and
+blockDim.y to be a multiple of 32 - confirm against the launch site.
+input            - the input data
+output           - the output data
+stride           - distance between two consecutive items of the reduced dimension
+strideNum        - number of items along the reduced dimension
+reducedStrideNum - number of results kept along the reduced dimension
+blockSize        - number of input items in each data block
+blockNum         - number of data blocks
+*/
+__global__
+void KernelReduceMaxOp(DTYPE * input, DTYPE * output,
+    int stride, int strideNum, int reducedStrideNum,
+    int blockSize, int blockNum)
+{
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
+
+    unsigned int tid = threadIdx.y;
+    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= stride * blockNum)
+        return;
+
+    /* first level reduction */
+    int k = i / stride;
+    int iOffset = i % stride;
+
+    DTYPE threadMax = FLOAT_MIN;
+
+    DTYPE * data = iData + threadIdx.x * blockDim.y;
+    DTYPE * inputData = input + k * blockSize;
+    for (int it = j; it < strideNum; it += blockDim.y)
+    {
+        threadMax = max(inputData[it * stride + iOffset], threadMax);
+    }
+    __syncthreads();
+
+    /* reduce inside every warp, lane 0 of each warp publishes the result */
+    threadMax = shfl_down_reduce_max(threadMax);
+    if ((tid & 0x1f) == 0) { data[tid / 32] = threadMax; }
+    __syncthreads();
+
+    /* the first warp reduces the per-warp results */
+    if (tid < 32)
+    {
+        if (tid < blockDim.y / 32)
+            threadMax = data[tid];
+        /* lanes without a per-warp result must not influence the max.
+           This used to be 0, which is wrong when all values are negative. */
+        else threadMax = FLOAT_MIN;
+        threadMax = shfl_down_reduce_max(threadMax);
+        if (tid == 0 && blockIdx.y < reducedStrideNum)
+            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
+    }
+}
+
 /* 
 get the max-valued items along a dimension of the tensor (cuda version). 
 For a 1-dimensional data array a,
@@ -382,130 +528,149 @@ void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
    int devIDBackup;
    ProtectCudaDev(input->devID, devIDBackup);

-    do{
-        if (input->dataType == DEFAULT_DTYPE) {
-            DTYPE * iData = NULL;
-            DTYPE * oData = NULL;
-            if (iter == 0) {
-                iData = (DTYPE*)input->data;
-                oData = buf1;
-            }
-            else if (iter % 2 == 1) {
-                iData = buf1;
-                oData = buf2;
-            }
-            else {
-                iData = buf2;
-                oData = buf1;
-            }
-
-            /* unroll the reduction procedure. The code is messy but it is faster. */
-            if (strideNum < 32) {
-                GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 128) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 256) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<128> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 512) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
+    if (stride == 1 && blockNum >= 10)
+    {
+        dim3 grids;
+        dim3 blocks;
+        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
+        if (blocks.y > 128)
+            KernelReduceMaxOp << <grids, blocks >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
+        else
+        {
+            KernelReduceMaxOpLessBlocks << <blockNum / 4, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
        }
-        else if (input->dataType == X_FLOAT16) {
-            __half * buf1ft16 = (__half *)buf1;
-            __half * buf2ft16 = (__half *)buf2;
-            __half * iData = NULL;
-            __half * oData = NULL;
-            if (iter == 0) {
-                iData = (__half*)input->data;
-                oData = buf1ft16;
-            }
-            else if (iter % 2 == 1) {
-                iData = buf1ft16;
-                oData = buf2ft16;
+    }
+    else
+    {
+        do {
+            if (input->dataType == DEFAULT_DTYPE) {
+                DTYPE * iData = NULL;
+                DTYPE * oData = NULL;
+                if (iter == 0) {
+                    iData = (DTYPE*)input->data;
+                    oData = buf1;
+                }
+                else if (iter % 2 == 1) {
+                    iData = buf1;
+                    oData = buf2;
+                }
+                else {
+                    iData = buf2;
+                    oData = buf1;
+                }
+
+                /* unroll the reduction procedure. The code is messy but it is faster. */
+                if (strideNum < 32) {
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 128) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
+                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 256) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
+                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 512) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
+                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    KernelReduceMaxFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
+                    adjustThreadForUseWarpOptimization(blocks, threads);
+                    KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
            }
-            else {
-                iData = buf2ft16;
-                oData = buf1ft16;
+            else if (input->dataType == X_FLOAT16) {
+                __half * buf1ft16 = (__half *)buf1;
+                __half * buf2ft16 = (__half *)buf2;
+                __half * iData = NULL;
+                __half * oData = NULL;
+                if (iter == 0) {
+                    iData = (__half*)input->data;
+                    oData = buf1ft16;
+                }
+                else if (iter % 2 == 1) {
+                    iData = buf1ft16;
+                    oData = buf2ft16;
+                }
+                else {
+                    iData = buf2ft16;
+                    oData = buf1ft16;
+                }
+
+                /* unroll the reduction procedure. The code is messy but it is faster. */
+                if (strideNum < 32) {
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    KernelReduceMax << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 128) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceMaxFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 256) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else if (strideNum < 512) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceMaxFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
+                else {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceMaxFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
+                }
            }

-            /* unroll the reduction procedure. The code is messy but it is faster. */
-            if (strideNum < 32) {
-                GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                KernelReduceMax << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 128) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<64> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 256) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else if (strideNum < 512) {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<256> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-            else {
-                GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceMaxFast<512> << <blocks, threads >> >(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-            }
-        }
-        
-        strideNum = cudaGridSize[0];
-        blockSize = cudaGridSize[0];
+            strideNum = cudaGridSize[0];
+            blockSize = cudaGridSize[0];

-        iter++;
+            iter++;

-    }while(strideNum > 1);
+        } while (strideNum > 1);
+    }

    BacktoCudaDev(input->devID, devIDBackup);


--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -27,6 +27,31 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

+/*
+warp-level sum reduction using register shuffles.
+This replaces the former hand-written PTX, which only handled int data, so the
+DTYPE partial sums passed by the kernels in this file were implicitly converted
+to int and lost their fractional part. The PTX also wrote to an asm operand that
+was declared input-only and used the non-sync form of "shfl.down".
+The function is now a template over the value type, so calls with int keep
+working and calls with floating-point values are summed without conversion.
+All 32 lanes of the warp must call this function together, because the shuffle
+is issued with a full participation mask.
+Lane 0 ends up with the sum over the whole warp; the other lanes only hold
+partial results.
+Requires the __shfl_down_sync intrinsic (CUDA 9.0 or later).
+input  - the value contributed by the calling lane
+return - on lane 0, the sum of the 32 contributed values
+*/
+template<class T>
+__device__ __forceinline__  T shfl_down_reduce_sum(T input)
+{
+    T output = input;
+
+    /* halve the shuffle distance at every step: 16, 8, 4, 2, 1 */
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1)
+        output += __shfl_down_sync(0xFFFFFFFF, output, offset);
+
+    return output;
+}
+
+
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
@@ -430,6 +455,196 @@ void KernelReduceSumFast(__half * input, __half * output,
 #endif
 }

+/*
+sum the items along a dimension whose items are not stored contiguously.
+Every thread produces one output item: it starts at its offset inside its data
+block and adds up strideNum items that are stride apart.
+NOTE(review): shift, power and isExp are accepted but not used by this kernel.
+NOTE(review): there is no bounds check on idx because the number of data blocks
+is not passed in; threads beyond the last output item read and write out of
+range unless the launch size matches the data exactly - confirm at the launch site.
+input     - the input data
+output    - receives one sum per thread
+stride    - distance between two consecutive items of the reduced dimension
+strideNum - number of items along the reduced dimension
+*/
+__global__ void KernelReduceSumDiscontinuousStorage(DTYPE * input, DTYPE * output,
+    int stride, int strideNum,
+    DTYPE * shift, DTYPE power, bool isExp)
+{
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int blockIndex = idx / stride;
+    int offsetInBlock = idx% stride;
+
+    /* first item of this thread and the end of its data block */
+    int begin = stride * strideNum * blockIndex + offsetInBlock;
+    int end = begin + stride * strideNum;
+
+    DTYPE ans = 0;
+#pragma unroll
+    for (int i = begin; i < end; i += stride)
+    {
+        ans += input[i];
+    }
+
+    /* the debug printf that was left here has been removed */
+    output[idx] = ans;
+}
+
+/*
+sum the items along a dimension, using warp shuffles.
+Each item has the shift value subtracted, is optionally raised to "power" and
+optionally passed through exp before it is accumulated.
+Threads along y walk over the reduced dimension; every warp combines its partial
+sums with shfl_down_reduce_sum, the per-warp results go through shared memory,
+and the first warp combines them into the value written by thread 0.
+input            - the input data
+output           - the output data
+stride           - distance between two consecutive items of the reduced dimension
+strideNum        - number of items along the reduced dimension
+reducedStrideNum - number of results kept along the reduced dimension
+blockSize        - number of input items in each data block
+blockNum         - number of data blocks
+shift            - value subtracted from every item, indexed by the x position (may be NULL)
+power            - exponent applied to every item (1.0 leaves the item unchanged)
+isExp            - whether exp is applied to every item
+*/
+__global__
+void KernelReduceSumOp(DTYPE * input, DTYPE * output,
+    int stride, int strideNum, int reducedStrideNum,
+    int blockSize, int blockNum,
+    DTYPE * shift, DTYPE power, bool isExp)
+{
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
+    __shared__ DTYPE bias[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+
+    unsigned int row = threadIdx.y;
+    unsigned int first = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
+    if (col >= stride * blockNum)
+        return;
+
+    /* one thread per column loads the shift value */
+    if (threadIdx.y == 0) {
+        if (shift != NULL)
+            bias[threadIdx.x] = shift[col];
+        else
+            bias[threadIdx.x] = 0;
+    }
+
+    __syncthreads();
+
+    /* locate the data block and the offset inside it */
+    int blockId = col / stride;
+    int offset = col % stride;
+
+    DTYPE * warpSums = iData + threadIdx.x * blockDim.y;
+    DTYPE * blockData = input + blockId * blockSize;
+
+    /* per-thread partial sum */
+    DTYPE partial = 0;
+    for (int n = first; n < strideNum; n += blockDim.y)
+    {
+        DTYPE item = blockData[n * stride + offset] - bias[threadIdx.x];
+        if (power == (DTYPE)2.0)
+            item = item * item;
+        else if (power == (DTYPE)0.5)
+            item = sqrt(item);
+        else if (power != (DTYPE)1.0)
+            item = pow(item, power);
+        if (isExp)
+            item = exp(item);
+        partial += item;
+    }
+    __syncthreads();
+
+    /* combine inside each warp, the first lane of each warp stores the result */
+    partial = shfl_down_reduce_sum(partial);
+    if (row % 32 == 0)
+        warpSums[row >> 5] = partial;
+    __syncthreads();
+
+    /* the first warp combines the per-warp results */
+    if (row < 32)
+    {
+        partial = (row < blockDim.y / 32) ? warpSums[row] : 0;
+        partial = shfl_down_reduce_sum(partial);
+        if (row == 0 && blockIdx.y < reducedStrideNum)
+            output[(blockId * reducedStrideNum + blockIdx.y) * stride + offset] = partial;
+    }
+}
+
+/*
+sum each contiguous vector of the input, one warp per vector.
+Each item has the shift value of its vector subtracted, is optionally raised to
+"power" and optionally passed through exp before it is accumulated.
+Each group of 32 consecutive threads along x handles one vector: every lane scans
+the vector with a step of 32, then the lanes are combined with
+shfl_down_reduce_sum and lane 0 writes the result.
+NOTE(review): assumes blockDim.x is a multiple of 32 so that a vector never
+straddles two warps - confirm against the launch site.
+input     - the input data, vector idy starts at idy * strideNum
+output    - receives one value per vector
+strideNum - number of items in each vector
+blockNum  - number of vectors
+shift     - value subtracted from every item, one per vector (may be NULL)
+power     - exponent applied to every item (1.0 leaves the item unchanged)
+isExp     - whether exp is applied to every item
+*/
+__global__
+void KernelReduceSumOpLessBlocks(DTYPE * input, DTYPE * output,
+    int strideNum, int blockNum,
+    DTYPE * shift, DTYPE power, bool isExp)
+{
+    int idx = threadIdx.x % 32;
+    int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+
+    /* warps beyond the last vector have nothing to do. The test is the same for
+       all lanes of a warp, so the shuffle below is still executed by full warps. */
+    if (idy >= blockNum)
+        return;
+
+    /* every lane reads the shift value into a register. It used to be written to
+       shared memory by lane 0 and read by the other lanes without a barrier. */
+    DTYPE bias = shift != NULL ? shift[idy] : 0;
+
+    /* 64-bit offset: idy * strideNum can exceed the range of int */
+    long long startIndex = (long long)idy * strideNum;
+    DTYPE threadSum = 0;
+    for (int i = idx; i < strideNum; i += 32)
+    {
+        DTYPE value = input[startIndex + i] - bias;
+        if (power != (DTYPE)1.0) {
+            if (power == (DTYPE)2.0) {
+                value = value * value;
+            }
+            else if (power == (DTYPE)0.5) {
+                value = sqrt(value);
+            }
+            else {
+                value = pow(value, power);
+            }
+        }
+        if (isExp) value = exp(value);
+        threadSum += value;
+    }
+
+    /* combine the 32 per-lane sums, lane 0 holds the sum of the vector */
+    threadSum = shfl_down_reduce_sum(threadSum);
+    if (idx == 0)
+        output[idy] = threadSum;
+}
+
+//choose the launch configuration for reducing vectors that are stored contiguously:
+//one thread block per vector, with the block laid out along y.
+//PyTorch allocates threads in a similar way, possibly with thresholds hard-coded
+//from the SM number (20 on the GTX 1080; NOTE(review): the original comment also
+//says 20 for the 1080 Ti - confirm), and it does give better performance.
+//The fewer vectors there are, the more warps each block gets (4, 8, 16 or 32),
+//but never more warps than are needed to cover one vector.
+//  grid       - receives the grid size (vectorNum, 1, 1)
+//  block      - receives the block size (1, warpNum * 32, 1)
+//  vectorNum  - number of vectors to reduce
+//  vectorSize - number of items in each vector
+inline void continuousStorageThreadAllocation(dim3& grid, dim3& block, long long vectorNum, int vectorSize)
+{
+    int warpNum = 4;
+    if (vectorNum < 20 * 8)
+    {
+        warpNum = 8;
+        if (vectorNum < 20 * 4)
+        {
+            warpNum = 16;
+            //this used to test warpNum (just set to 16), which was always true
+            //and made the 16-warp configuration unreachable
+            if (vectorNum < 20 * 2)
+                warpNum = 32;
+        }
+    }
+
+    //number of warps needed to cover one vector, rounded up
+    int minWarpNum = vectorSize / 32;
+    if (vectorSize % 32 != 0) minWarpNum++;
+    warpNum = min(warpNum, minWarpNum);
+
+    grid.x = vectorNum;
+    grid.y = 1;
+    grid.z = 1;
+    block.x = 1;
+    block.y = warpNum * 32;
+    block.z = 1;
+}
+
+//launch configuration for the case where shared memory is not used:
+//blocks of 512 threads along x, and enough of them for block.x * grid.x
+//to cover stride * blockNum items. Only the x and y extents are set.
+inline void discontinuousStorageNoShareMemThreadAllocation(dim3& grid, dim3& block, int stride, int blockNum)
+{
+    const int itemNum = stride * blockNum;
+    const int fullBlocks = itemNum / 512;
+    const bool hasRemainder = (itemNum % 512 != 0);
+
+    block.x = 512;
+    block.y = 1;
+
+    //one extra block for the items that do not fill a whole block
+    grid.x = hasRemainder ? fullBlocks + 1 : fullBlocks;
+    grid.y = 1;
+}
+
+//reshape a launch configuration for the warp-shuffle based kernels:
+//the block becomes one thread wide along x (its former width is multiplied
+//into the grid) and gets at least 32 threads along y.
+//  blocks  - grid size, modified in place
+//  threads - block size, modified in place
+inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
+{
+    //move the x extent of the block into the grid
+    if (threads.x > 1) {
+        const unsigned int width = threads.x;
+        threads.x = 1;
+        blocks.x *= width;
+    }
+
+    //pad the y extent up to 32
+    if (threads.y < 32) {
+        threads.y = 32;
+    }
+}
+
 /* 
 sum the items along a dimension of the tensor (cuda version). 
 For a 1-dimensional data array a,
@@ -495,137 +710,151 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen

    int devIDBackup;
    ProtectCudaDev(input->devID, devIDBackup);
-
-    do{
-        if(input->dataType == DEFAULT_DTYPE){
-            DTYPE * iData = NULL;
-            DTYPE * oData = NULL;
-            if (iter == 0) {
-                iData = (DTYPE*)input->data;
-                oData = buf1;
-            }
-            else if (iter % 2 == 1) {
-                iData = buf1;
-                oData = buf2;
-            }
-            else {
-                iData = buf2;
-                oData = buf1;
-            }
-            /* unroll the reduction procedure. The code is messy but it is faster. */
-            if(strideNum < 32){
-                GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                KernelReduceSum <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
-            }
-            else if(strideNum < 128){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
-            }
-            else if(strideNum < 256){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
-            }
-            else if(strideNum < 512){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
-            }
-            else{
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (DTYPE*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
-            }
+    if (stride == 1 && blockNum >= 10)
+    {
+        dim3 grids;
+        dim3 blocks;
+        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
+        if (blocks.y > 128)
+            KernelReduceSumOp << <grids, blocks >> > ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum, sp, power, isExp);
+        else
+        {
+            KernelReduceSumOpLessBlocks << <blockNum / 4, 128 >> > ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum, sp, power, isExp);
        }
-        else if(input->dataType == X_FLOAT16){
-            __half * buf1ft16 = (__half *)buf1;
-            __half * buf2ft16 = (__half *)buf2;
-            __half * spft16 = (__half *)sp;
-            unsigned short power2 = FloatToFloat16(power);
-            __half * powerft16p = (__half*)&power2;
-            __half * iData = NULL;
-            __half * oData = NULL;
-            if (iter == 0) {
-                iData = (__half*)input->data;
-                oData = buf1ft16;
-            }
-            else if (iter % 2 == 1) {
-                iData = buf1ft16;
-                oData = buf2ft16;
-            }
-            else {
-                iData = buf2ft16;
-                oData = buf1ft16;
-            }
-
-            /* unroll the reduction procedure. The code is messy but it is faster. */
-            if(strideNum < 32){
-                GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
-            }
-            else if(strideNum < 128){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<64> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
-            }
-            else if(strideNum < 256){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<128> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
-            }
-            else if(strideNum < 512){
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<256> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+        //printf("grad %d %d thread %d %d\n", grids.x, grids.y, blocks.x, blocks.y);
+    }
+    else
+    {
+        do {
+            if (input->dataType == DEFAULT_DTYPE) {
+                DTYPE * iData = NULL;
+                DTYPE * oData = NULL;
+                if (iter == 0) {
+                    iData = (DTYPE*)input->data;
+                    oData = buf1;
+                }
+                else if (iter % 2 == 1) {
+                    iData = buf1;
+                    oData = buf2;
+                }
+                else {
+                    iData = buf2;
+                    oData = buf1;
+                }
+                /* unroll the reduction procedure. The code is messy but it is faster. */
+                if (strideNum < 32) {
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
+                }
+                else if (strideNum < 128) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
+                }
+                else if (strideNum < 256) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
+                }
+                else if (strideNum < 512) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
+                }
+                else {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (DTYPE*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, sp, power, isExp);
+                }
            }
-            else{
-                GDevs.GetCudaThread2D(devID, MAX(strideNum/2+1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                if (cudaGridSize[0] == 1)
-                    oData = (__half*)output->data;
-                CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
-                KernelReduceSumFast<512> <<<blocks, threads >>>(iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+            else if (input->dataType == X_FLOAT16) {
+                __half * buf1ft16 = (__half *)buf1;
+                __half * buf2ft16 = (__half *)buf2;
+                __half * spft16 = (__half *)sp;
+                unsigned short power2 = FloatToFloat16(power);
+                __half * powerft16p = (__half*)&power2;
+                __half * iData = NULL;
+                __half * oData = NULL;
+                if (iter == 0) {
+                    iData = (__half*)input->data;
+                    oData = buf1ft16;
+                }
+                else if (iter % 2 == 1) {
+                    iData = buf1ft16;
+                    oData = buf2ft16;
+                }
+                else {
+                    iData = buf2ft16;
+                    oData = buf1ft16;
+                }
+
+                /* unroll the reduction procedure. The code is messy but it is faster. */
+                if (strideNum < 32) {
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    KernelReduceSum << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+                }
+                else if (strideNum < 128) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 64), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<64> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+                }
+                else if (strideNum < 256) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 128), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<128> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+                }
+                else if (strideNum < 512) {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 256), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<256> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+                }
+                else {
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
+                    if (cudaGridSize[0] == 1)
+                        oData = (__half*)output->data;
+                    CheckNTErrors((cudaBlockSize[0] >= 512), "Incorrect thread number when calling the cuda kernel!");
+                    KernelReduceSumFast<512> << <blocks, threads >> > (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum, spft16, *powerft16p, isExp);
+                }
            }
-        }

-        strideNum = cudaGridSize[0];
-        blockSize = cudaGridSize[0];
-        sp = NULL;
-        power = (DTYPE)1.0;
-        isExp = false;
+            strideNum = cudaGridSize[0];
+            blockSize = cudaGridSize[0];
+            sp = NULL;
+            power = (DTYPE)1.0;
+            isExp = false;

-        iter++;
-
-    }while(strideNum > 1);
+            iter++;

+        } while (strideNum > 1);
+    }
    ProtectCudaDev(input->devID, devIDBackup);

    if (mem != NULL)