From 467c2ed790e5327e0449a68b520661b219a57df6 Mon Sep 17 00:00:00 2001
From: ZHNAYGUHAO <yoohao.zhang@gmail.com>
Date: Thu, 17 Oct 2019 09:08:08 +0800
Subject: [PATCH] Add ReduceMin operation using #define macros shared with ReduceMax

---
 source/tensor/core/reduce/ReduceMax.cpp    | 318 ++++++++++++++++-----------------
 source/tensor/core/reduce/ReduceMax.cu     | 800 +++++++++++++++++++--------------
 source/tensor/core/reduce/ReduceMax.cuh    |   3 +++
 source/tensor/core/reduce/ReduceMax.h      |   9 +++++++++
 source/tensor/core/reduce/VectorBuffer.cpp |   9 +++++++++
 source/tensor/core/reduce/VectorBuffer.h   |   3 +++
 6 files changed, 609 insertions(+), 533 deletions(-)
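
A usage sketch (illustrative names and shapes; InitTensor2D and
XTensor::SetDataRand are the existing helpers from the tensor library):

    XTensor a;
    InitTensor2D(&a, 4, 64, X_FLOAT);
    a.SetDataRand(-1.0F, 1.0F);
    XTensor mn = ReduceMin(a, 1);   /* mn[i] = min_j a[i][j] */
    XTensor mx = ReduceMax(a, 1);   /* mx[i] = max_j a[i][j] */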

diff --git a/source/tensor/core/reduce/ReduceMax.cpp b/source/tensor/core/reduce/ReduceMax.cpp
index 7074d66..929e5ae 100644
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
@@ -28,6 +28,8 @@
 
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 
+
+
 /* 
 get the max value of the items along a dimension of the tensor
 
@@ -35,129 +37,147 @@ get the max value of the items along a dimension of the tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on
 */
-void _ReduceMax(const XTensor * input, XTensor * output, int dim)
-{
-    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), 
-                  "This code must be run on the same device!");
-    CheckNTErrors((input && output), "Empty input or output tensors!");
-    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
-    CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
-    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
-    
-    CheckNTErrors(dim < input->order, "Wrong dimension!");
-
-    for(int i = 0; i < input->order; i++){
-        if(i < dim){
-            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), 
-                          "Unmatched tensors!");
-        }
-        else if(i > dim){
-            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), 
-                          "Unmatched tensors!");
-        }
-    }
-
-    if(input->devID >= 0){
-#ifdef USE_CUDA
-        _CudaReduceMax(input, output, dim);
-#endif
-    }
-    else{
-        CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
-
-        int stride = 1;
-        int strideNum = input->dimSize[dim];
-        int blockSize = 1;
-        int blockNum = 1;
-        for (int i = 0; i < input->order; i++) {
-            if (i > dim)
-                stride *= input->dimSize[i];
-            else if (i < dim)
-                blockNum *= input->dimSize[i];
-        }
-        blockSize = stride * strideNum;
 
+#define _REDUCE_CPU_FUNCTION(_funcCPUName, _vectorOp, _reduceOp)                                                    \
+void _funcCPUName(const XTensor * input, XTensor * output, int dim)                                                 \
+{                                                                                                                   \
+    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),                       \
+        "This code must be run on the same device!");                                                               \
+    CheckNTErrors((input && output), "Empty input or output tensors!");                                             \
+    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");                                  \
+    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!");                                \
+    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");                                  \
+                                                                                                                    \
+    CheckNTErrors(dim < input->order, "Wrong dimension!");                                                          \
+                                                                                                                    \
+    for (int i = 0; i < input->order; i++) {                                                                       \
+        if (i < dim) {                                                                                             \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]),                                               \
+                "Unmatched tensors!");                                                                             \
+        }                                                                                                          \
+        else if (i > dim) {                                                                                        \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]),                                           \
+                "Unmatched tensors!");                                                                             \
+        }                                                                                                          \
+    }                                                                                                              \
+    CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");                                                     \
+    int stride = 1;                                                                                                 \
+    int strideNum = input->dimSize[dim];                                                                            \
+    int blockSize = 1;                                                                                              \
+    int blockNum = 1;                                                                                               \
+    for (int i = 0; i < input->order; i++) {                                                                        \
+        if (i > dim)                                                                                                \
+            stride *= input->dimSize[i];                                                                            \
+        else if (i < dim)                                                                                           \
+            blockNum *= input->dimSize[i];                                                                          \
+    }                                                                                                               \
+    blockSize = stride * strideNum;                                                                                 \
+                                                                                                                    \
+                                                                                                                    \
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){ \
+        int vecBufLength =  32 / sizeof(DTYPE);                                                                     \
+                                                                                                                    \
+        if (dim == input->order - 1) {                                                                              \
+            /* data is contiguous in dim 0 */                                                                      \
+            for (int i = 0; i < blockNum; i++) {                                                                    \
+                DTYPE * ip = (DTYPE*)input->data + blockSize * i;                                                   \
+                DTYPE * op = (DTYPE*)output->data + i;                                                              \
+                VectorBuffer vecBuf[4];                                                                             \
+                for (int j = 0; j < 4; j++) {                                                                       \
+                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength);                                 \
+                }                                                                                                   \
+                for (int j = 1; j < strideNum / 32; j++) {                                                          \
+                    const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);                                             \
+                    vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength));                   \
+                    vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength));                   \
+                    vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength));                   \
+                    vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength));                   \
+                }                                                                                                   \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[1]);                                                         \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[2]);                                                         \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[3]);                                                         \
+                DTYPE maxN = vecBuf[0][0];                                                                          \
+                for (int k = 1; k < vecBufLength; k++) {                                                            \
+                    maxN = _reduceOp(maxN, vecBuf[0][k]);                                                           \
+                }                                                                                                   \
+                *op = maxN;                                                                                         \
+            }                                                                                                       \
+                                                                                                                    \
+        }                                                                                                           \
+        else {                                                                                                      \
+            /* data is separated */                                                                                 \
+            for(int i = 0; i < blockNum; i++){                                                                      \
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){                                     \
+                    DTYPE * ip = (DTYPE*)input->data + blockSize * i;                                               \
+                    DTYPE * op = (DTYPE*)output->data + stride * i;                                                 \
+                    VectorBuffer vecBuf[4];                                                                         \
+                    for(int k = 0; k < 4; k++){                                                                     \
+                        vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE));           \
+                                                                                                                    \
+                    }                                                                                               \
+                    for(int k = 1; k < strideNum; k++){                                                             \
+                        DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;                                     \
+                        vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength));               \
+                        vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength));               \
+                        vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength));               \
+                        vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength));               \
+                    }                                                                                               \
+                    for(int k = 0; k < 4; k++){                                                                     \
+                        for(int l = 0; l < vecBufLength; l++)                                                       \
+                            *(op + j * 32 + 8 * k + l) = vecBuf[k][l];                                              \
+                    }                                                                                               \
+                }                                                                                                   \
+            }                                                                                                       \
+        }                                                                                                           \
+    } /* end of the vectorized path */                                                                             \
+    else{                                                                                                           \
+        for(int k = 0; k < blockNum; k++){                                                                          \
+            DTYPE * ip = (DTYPE*)input->data + blockSize * k;                                                       \
+            DTYPE * op = (DTYPE*)output->data + stride * k;                                                         \
+            for(int i = 0; i < stride; i++){                                                                        \
+                DTYPE * ipe = ip + blockSize;                                                                       \
+                DTYPE tmpData = *(ip + i);                                                                          \
+                for(DTYPE * ipb = ip + i + stride; ipb < ipe; ipb += stride){                                       \
+                    DTYPE v = *ipb;                                                                                 \
+                    tmpData = _reduceOp(tmpData, v);                                                                \
+                }                                                                                                   \
+                *(op + i) = tmpData;                                                                                \
+            }                                                                                                       \
+        }                                                                                                           \
+    }                                                                                                               \
+}
 
-        if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
-            int vecBufLength =  32 / sizeof(DTYPE);
-
-            if (dim == input->order - 1) {
-                //data is contiguous in dim 0
-                for (int i = 0; i < blockNum; i++) {
-                    DTYPE * ip = (DTYPE*)input->data + blockSize * i;
-                    DTYPE * op = (DTYPE*)output->data + i;
-                    VectorBuffer vecBuf[4];
-                    for (int j = 0; j < 4; j++) {
-                        vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength);
-                    }
-                    for (int j = 1; j < strideNum / 32; j++) {
-                        const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
-                        vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
-                        vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
-                        vecBuf[2] = vecBuf[2].maxData(VectorBuffer::loadu(ptr + 2 * vecBufLength));
-                        vecBuf[3] = vecBuf[3].maxData(VectorBuffer::loadu(ptr + 3 * vecBufLength));
-                    }
-                    vecBuf[0] = vecBuf[0].maxData(vecBuf[1]);
-                    vecBuf[0] = vecBuf[0].maxData(vecBuf[2]);
-                    vecBuf[0] = vecBuf[0].maxData(vecBuf[3]);
-                    DTYPE maxN = DTYPE_MIN;
-                    for (int k = 0; k < vecBufLength; k++) {
-                        maxN = MAX(maxN, vecBuf[0][k]);
-                    }
-                    *op = maxN;
-                }
-
-            }
-            else {
-                //data is separated
-                for(int i = 0; i < blockNum; i++){
-                    for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
-                        DTYPE * ip = (DTYPE*)input->data + blockSize * i;
-                        DTYPE * op = (DTYPE*)output->data + stride * i;
-                        VectorBuffer vecBuf[4];
-                        for(int k = 0; k < 4; k++){
-                            vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE));
-
-                        }
-                        for(int k = 1; k < strideNum; k++){
-                            DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
-                            vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
-                            vecBuf[1] = vecBuf[1].maxData(VectorBuffer::loadu(ptr + 1 * vecBufLength));
-                            vecBuf[2] = vecBuf[2].maxData(VectorBuffer::loadu(ptr + 2 * vecBufLength));
-                            vecBuf[3] = vecBuf[3].maxData(VectorBuffer::loadu(ptr + 3 * vecBufLength));
-                        }
-                        for(int k = 0; k < 4; k++){
-                            for(int l = 0; l < vecBufLength; l++)
-                                *(op + j * 32 + 8 * k + l) = vecBuf[k][l];
-                        }
-                    }
-                }
-            }
-        }//run vector buffer
-        else{
-            for(int k = 0; k < blockNum; k++){
-                DTYPE * ip = (DTYPE*)input->data + blockSize * k;
-                DTYPE * op = (DTYPE*)output->data + stride * k;
-                for(int i = 0; i < stride; i++){
-    //#if defined(USE_BLAS)
-    //                    *(op + i) = *(ip + i + (int)(stride * IAMAX(strideNum, ip + i, stride)));
-    //#else
-                        DTYPE max = DTYPE_MIN;
-                        DTYPE * ipe = ip + blockSize;
-                        for(DTYPE * ipb = ip + i; ipb < ipe; ipb += stride){
-                            DTYPE v = *ipb;
-                            if(max < v)
-                                max = v;
-                        }
-                        *(op + i) = max;
-    //#endif
-                }
-            }
-        }
-    }
+_REDUCE_CPU_FUNCTION(reduceMaxCPU, maxData, MAX)
+_REDUCE_CPU_FUNCTION(reduceMinCPU, minData, MIN)
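+
+/* The two expansions above differ only in the SIMD combiner
+   (VectorBuffer::maxData vs. minData) and the scalar combiner (MAX vs. MIN);
+   the vectorized fast path and the strided fallback are otherwise shared. */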
+
+#ifdef USE_CUDA
+#define _REDUCE_FUNCTION(_funcName, _cudaFuncName, _cpuFuncName)                                                    \
+void _funcName(const XTensor * input, XTensor * output, int dim)                                                    \
+{                                                                                                                    \
+    if(input->devID >= 0){                                                                                           \
+        _cudaFuncName(input, output, dim);                                                                           \
+    }                                                                                                                \
+    else{                                                                                                            \
+        _cpuFuncName(input, output, dim);                                                                            \
+    }                                                                                                                \
 }
+_REDUCE_FUNCTION(_ReduceMax, _CudaReduceMax, reduceMaxCPU)
+_REDUCE_FUNCTION(_ReduceMin, _CudaReduceMin, reduceMinCPU)
+#else
+#define _REDUCE_FUNCTION(_funcName, _cpuFuncName)                                                                   \
+void _funcName(const XTensor * input, XTensor * output, int dim)                                                    \
+{                                                                                                                    \
+    CheckNTErrors((input->devID < 0), "This code must be run on the CPU!");                                          \
+    _cpuFuncName(input, output, dim);                                                                                \
+}
+_REDUCE_FUNCTION(_ReduceMax, reduceMaxCPU)
+_REDUCE_FUNCTION(_ReduceMin, reduceMinCPU)
+#endif
 
-/* 
+
+/*
 get the max value of the items along a dimension of the tensor (return an XTensor structure).
 make a new tensor to keep the result and return it
 
@@ -165,34 +185,38 @@ make a new tensor to keep the result and return it
 >> dim - the dimension where the reduction is performed on
 << return - the max value of the items along a dimension of the tensor
 */
-XTensor ReduceMax(const XTensor &input, int dim)
-{
-    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-	
-    int order = input.order - 1;
-    int * dimSize = new int[order];
-    for(int i = 0; i < order; i++){
-        if(i < dim)
-            dimSize[i] = input.dimSize[i];
-        else if(i >= dim)
-            dimSize[i] = input.dimSize[i + 1];
-    }
-
-    float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
-    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
-    output.SetTMPFlag();
-
-    /* call _ReduceMax function */
-    _ReduceMax(&input, &output, dim);
-    
-    /* tensor connection */
-    XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
-    XLink::AddParamToHeadInt(&output, dim);
-
-    /* destroy variables */
-    delete[] dimSize;
-
-    return output;
+#define REDUCE_FUNCTION(funcName, funcOp, opType)                                                                  \
+XTensor funcName(const XTensor & input, int dim)                                                                   \
+{                                                                                                                   \
+    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");                                  \
+                                                                                                                    \
+    int order = input.order - 1;                                                                                    \
+    int * dimSize = new int[order];                                                                                 \
+    for(int i = 0; i < order; i++){                                                                                 \
+        if(i < dim)                                                                                                 \
+            dimSize[i] = input.dimSize[i];                                                                          \
+        else if(i >= dim)                                                                                           \
+            dimSize[i] = input.dimSize[i + 1];                                                                      \
+    }                                                                                                               \
+                                                                                                                    \
+    float dr = (!input.isSparse) ? 1.0F : input.denseRatio;                                                         \
+    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);                                     \
+    output.SetTMPFlag();                                                                                            \
+                                                                                                                    \
+    /* call the underlying reduction function */                                                                   \
+    funcOp(&input, &output, dim);                                                                                   \
+                                                                                                                    \
+    /* tensor connection */                                                                                         \
+    XLink::MakeLink(&input, NULL, &output, opType);                                                                \
+    XLink::AddParamToHeadInt(&output, dim);                                                                         \
+                                                                                                                    \
+    /* destroy variables */                                                                                         \
+    delete[] dimSize;                                                                                               \
+                                                                                                                    \
+    return output;                                                                                                  \
 }
 
+/* note: REDUCE_REDUCEMIN is assumed to be declared in XName.h alongside REDUCE_REDUCEMAX */
+REDUCE_FUNCTION(ReduceMax, _ReduceMax, REDUCE_REDUCEMAX)
+REDUCE_FUNCTION(ReduceMin, _ReduceMin, REDUCE_REDUCEMIN)
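+
+/* Design note: each expansion allocates the reduced-shape output, calls the
+   underlying _ReduceMax/_ReduceMin, and records the operation via XLink so
+   that backward propagation can locate it in the network. */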
+
 } // namespace nts(NiuTrans.Tensor)
diff --git a/source/tensor/core/reduce/ReduceMax.cu b/source/tensor/core/reduce/ReduceMax.cu
index 599945d..942535e 100644
--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -33,67 +33,75 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 use PTX code to reduce float data
 */
-__device__ __forceinline__  
-float shflDownReduceMax(float input)
-{
-    float output;
-    asm volatile(
-        "{"
-        ".reg .f32 r0;"
-        ".reg .pred p;"
-        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
-        "setp.lt.f32    p,%1,r0;"
-        "@p mov.f32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
-        "setp.lt.f32    p,%1,r0;"
-        "@p mov.f32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
-        "setp.lt.f32    p,%1,r0;"
-        "@p mov.f32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
-        "setp.lt.f32    p,%1,r0;"
-        "@p mov.f32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
-        "setp.lt.f32    p, %1, r0; "
-        "@p mov.f32     %1,r0;"
-        "mov.f32        %0,%1;"
-        "}"
-        : "=f"(output) : "f"(input));
-    return output;
+#define SHFLFUNCFLOAT(funcName, reducePTXOp)                   \
+__device__ __forceinline__                                     \
+float funcName(float input)                                    \
+{                                                              \
+    float output;                                              \
+    asm volatile(                                              \
+        "{"                                                    \
+        ".reg .f32 r0;"                                        \
+        ".reg .pred p;"                                        \
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"   \
+        "setp."#reducePTXOp".f32    p,%1,r0;"                  \
+        "@p mov.f32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"     \
+        "setp."#reducePTXOp".f32    p,%1,r0;"                  \
+        "@p mov.f32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"     \
+        "setp."#reducePTXOp".f32    p,%1,r0;"                  \
+        "@p mov.f32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"     \
+        "setp."#reducePTXOp".f32    p,%1,r0;"                  \
+        "@p mov.f32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"     \
+        "setp."#reducePTXOp".f32    p, %1, r0; "               \
+        "@p mov.f32     %1,r0;"                                \
+        "mov.f32        %0,%1;"                                \
+        "}"                                                    \
+        : "=f"(output) : "f"(input));                          \
+    return output;                                             \
 }
 
+SHFLFUNCFLOAT(shflDownReduceMax, lt)
+SHFLFUNCFLOAT(shflDownReduceMin, gt)
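+
+/* note: the PTX predicate is the *replace* condition: with "lt", %1 is
+   overwritten whenever it is less than the shuffled lane value, so the warp
+   converges to the maximum; "gt" symmetrically yields the minimum. */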
+
 /*
 use PTX code to reduce int data
 */
-__device__ __forceinline__
-int shflDownReduceMax(int input)
-{
-    int output;
-    asm volatile(
-        "{"
-        ".reg .s32 r0;"
-        ".reg .pred p;"
-        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
-        "setp.lt.s32    p,%1,r0;"
-        "@p mov.s32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
-        "setp.lt.s32    p,%1,r0;"
-        "@p mov.s32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
-        "setp.lt.s32    p,%1,r0;"
-        "@p mov.s32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
-        "setp.lt.s32    p,%1,r0;"
-        "@p mov.s32     %1,r0;"
-        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
-        "setp.lt.s32    p, %1, r0; "
-        "@p mov.s32     %1,r0;"
-        "mov.s32        %0,%1;"
-        "}"
-        : "=r"(output) : "r"(input));
-    return output;
+#define SHFLFUNCINT(funcName, reducePTXOp)                     \
+__device__ __forceinline__                                     \
+int funcName(int input)                                        \
+{                                                              \
+    int output;                                                \
+    asm volatile(                                              \
+        "{"                                                    \
+        ".reg .s32 r0;"                                        \
+        ".reg .pred p;"                                        \
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"   \
+        "setp."#reducePTXOp".s32    p,%1,r0;"                  \
+        "@p mov.s32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"     \
+        "setp."#reducePTXOp".s32    p,%1,r0;"                  \
+        "@p mov.s32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"     \
+        "setp."#reducePTXOp".s32    p,%1,r0;"                  \
+        "@p mov.s32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"     \
+        "setp."#reducePTXOp".s32    p,%1,r0;"                  \
+        "@p mov.s32     %1,r0;"                                \
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"     \
+        "setp."#reducePTXOp".s32    p, %1, r0; "               \
+        "@p mov.s32     %1,r0;"                                \
+        "mov.s32        %0,%1;"                                \
+        "}"                                                    \
+        : "=r"(output) : "r"(input));                          \
+    return output;                                             \
 }
 
+SHFLFUNCINT(shflDownReduceMax, lt)
+SHFLFUNCINT(shflDownReduceMin, gt)
+
 /* 
 reduce a tensor to another that keeps the max value along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
 crossing of the i-th column and the j-th row.
 >> blockSize - size of the block (i.e., stride * strideNum)
 >> blockNum - how many blocks
 */
- __global__
-void KernelReduceMax(DTYPE * input, DTYPE * output, 
-                     int stride, int strideNum, int reducedStrideNum, 
-                     int blockSize, int blockNum)
-{
-    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
-
-    int idx = threadIdx.x * blockDim.y + threadIdx.y;
-    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if(i >= stride * blockNum)
-        return;
-
-    __syncthreads();
-
-    int k = i / stride;
-    int iOffset = i % stride;
-
-    DTYPE value = (i < stride * blockNum && j < strideNum) ? 
-                   input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
-
-    /* load data into the shared mem */
-    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
-
-    __syncthreads();
-
-    /* do reduction in shared mem */
-    for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
-        if(threadIdx.y < s && iData[idx] < iData[idx + s]){
-            iData[idx] = iData[idx + s];
-        }
-
-        __syncthreads();
-    }
-
-    /* write result for this block to the output array */
-    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) 
-        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
-
+#define KERNELREDUCEFUN3(funName, opName, initData)                                                         \
+ __global__                                                                                                 \
+void funName(DTYPE * input, DTYPE * output,                                                                 \
+                     int stride, int strideNum, int reducedStrideNum,                                       \
+                     int blockSize, int blockNum)                                                           \
+{                                                                                                           \
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];                 \
+                                                                                                            \
+    int idx = threadIdx.x * blockDim.y + threadIdx.y;                                                       \
+    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;                                                   \
+    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;                                                   \
+                                                                                                            \
+    if(i >= stride * blockNum)                                                                              \
+        return;                                                                                             \
+                                                                                                            \
+    __syncthreads();                                                                                        \
+                                                                                                            \
+    int k = i / stride;                                                                                     \
+    int iOffset = i % stride;                                                                               \
+                                                                                                            \
+    DTYPE value = (i < stride * blockNum && j < strideNum) ?                                                \
+                   input[blockSize * k + stride * j + iOffset] : initData;                                  \
+                                                                                                            \
+    /* load data into the shared mem */                                                                     \
+    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;                                                  \
+                                                                                                            \
+    __syncthreads();                                                                                        \
+                                                                                                            \
+    /* do reduction in shared mem */                                                                        \
+    for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){                                                    \
+        if(threadIdx.y < s){                                                                                \
+            iData[idx] = opName(iData[idx + s], iData[idx]);                                                \
+        }                                                                                                   \
+                                                                                                            \
+        __syncthreads();                                                                                    \
+    }                                                                                                       \
+                                                                                                            \
+    /* write result for this block to the output array */                                                   \
+    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)                                                  \
+        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];   \
+                                                                                                            \
 }
 
+KERNELREDUCEFUN3(KernelReduceMax, MAX, FLOAT_MIN)
+KERNELREDUCEFUN3(KernelReduceMin, MIN, MAX_FLOAT)
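+
+/* initData is the identity element of the reduction: out-of-range threads
+   load FLOAT_MIN under max and MAX_FLOAT under min, so the padding can
+   never win a comparison. */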
+
 /*
 reduce a tensor to another that keeps the max value along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
@@ -231,48 +243,52 @@ reduce a tensor to another that keeps the max value along a dimension  - fast ve
 >> blockSize - size of the block (i.e., stride * strideNum)
 >> blockNum - how many blocks
 */
-template <unsigned int goodSize> __global__
-void KernelReduceMaxFast(DTYPE * input, DTYPE * output, 
-                         int stride, int strideNum, int reducedStrideNum, 
-                         int blockSize, int blockNum)
-{
-    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-
-    unsigned int tid = threadIdx.y;
-    unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    
-    if(i >= stride * blockNum)
-        return;
-
-    __syncthreads();
-
-    /* first level reduction */
-    int k = i / stride;
-    int iOffset = i % stride;
-
-    DTYPE * data = iData + threadIdx.x * blockDim.y;
-    DTYPE * inputData = input + k * blockSize;
-    DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
-    DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
-
-    value = MAX(value, value2);
-    value = shflDownReduceMax(value);
-    if ((tid & 0x1f) == 0) 
-        data[tid / 32] = value;
-    __syncthreads();
-
-    if (tid < 32) {
-        if (tid < blockDim.y / 32)
-            value = data[tid];
-        else 
-            value = FLOAT_MIN;
-        value = shflDownReduceMax(value);
-        if (tid == 0 && blockIdx.y < reducedStrideNum)
-            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
-    }
+#define KERNELREDUCEFUN4(funName, opName, opFuncName, initData)                                            \
+template <unsigned int goodSize> __global__                                                                \
+void funName(DTYPE * input, DTYPE * output,                                                                \
+                         int stride, int strideNum, int reducedStrideNum,                                  \
+                         int blockSize, int blockNum)                                                      \
+{                                                                                                          \
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];                                                 \
+                                                                                                           \
+    unsigned int tid = threadIdx.y;                                                                        \
+    unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;                                          \
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;                                                \
+                                                                                                           \
+    if(i >= stride * blockNum)                                                                             \
+        return;                                                                                            \
+                                                                                                           \
+    __syncthreads();                                                                                       \
+                                                                                                           \
+    /* first level reduction */                                                                            \
+    int k = i / stride;                                                                                    \
+    int iOffset = i % stride;                                                                              \
+                                                                                                           \
+    DTYPE * data = iData + threadIdx.x * blockDim.y;                                                       \
+    DTYPE * inputData = input + k * blockSize;                                                             \
+    DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : initData;                              \
+    DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: initData;  \
+                                                                                                           \
+    value = opName(value, value2);                                                                         \
+    value = opFuncName(value);                                                                             \
+    if ((tid & 0x1f) == 0)                                                                                 \
+        data[tid / 32] = value;                                                                            \
+    __syncthreads();                                                                                       \
+                                                                                                           \
+    if (tid < 32) {                                                                                        \
+        if (tid < blockDim.y / 32)                                                                         \
+            value = data[tid];                                                                             \
+        else                                                                                               \
+            value = initData;                                                                              \
+        value = opFuncName(value);                                                                         \
+        if (tid == 0 && blockIdx.y < reducedStrideNum)                                                     \
+            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;                        \
+    }                                                                                                      \
 }
 
+KERNELREDUCEFUN4(KernelReduceMaxFast, MAX, shflDownReduceMax, FLOAT_MIN)
+KERNELREDUCEFUN4(KernelReduceMinFast, MIN, shflDownReduceMin, MAX_FLOAT)
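+
+/* the fast version preloads two elements per thread (j and j + blockDim.y)
+   and combines them before the warp shuffle, halving the grid along y. */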
+
 /*
 reduce a tensor to another that keeps the max value along a dimension  - fast version
 >> input - the input array (representing a tensor)
@@ -372,14 +388,12 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
         int stride4 = stride3 + stride;
         for(int k = 0; k < blockSize; k += stride4){
             DTYPE m = MAX(MAX(ip[k], ip[k + stride]), MAX(ip[k + stride2], ip[k + stride3]));
-            if(max < m)
-                max = m;
+            max = MAX(max, m);
         }
     }
     else{
-        for(int k = 0; k < blockSize; k += stride)
-            if(max < ip[k])
-                max = ip[k];
+        for (int k = 0; k < blockSize; k += stride)
+            max = MAX(max, ip[k]);
     }
 
     __syncthreads();
@@ -429,66 +443,75 @@ inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
 /*
 In some cases, we use fewer blocks to improve efficiency
 */
-__global__
-void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
-{
-    int idx = threadIdx.x % 32;
-    int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
-
-    int startIndex = idy * strideNum;
-    DTYPE threadMax = FLOAT_MIN;
-    for (int i = idx; i < strideNum; i += 32) {
-        threadMax = max(input[startIndex + i], threadMax);
-    }
-    threadMax = shflDownReduceMax(threadMax);
-    if (idx == 0) 
-        output[idy] = threadMax;
+#define KERNELREDUCEFUN2(funName, opName, opFuncName, initData)                   \
+__global__                                                                        \
+void funName(DTYPE * input, DTYPE * output, int strideNum, int blockNum)          \
+{                                                                                 \
+    int idx = threadIdx.x % 32;                                                   \
+    int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;                       \
+                                                                                  \
+    int startIndex = idy * strideNum;                                             \
+    DTYPE threadMax = initData;                                                   \
+    for (int i = idx; i < strideNum; i += 32) {                                   \
+        threadMax = opName(input[startIndex + i], threadMax);                     \
+    }                                                                             \
+    threadMax = opFuncName(threadMax);                                            \
+    if (idx == 0)                                                                 \
+        output[idy] = threadMax;                                                  \
 }
 
+KERNELREDUCEFUN2(KernelReduceMaxOpLessBlocks, MAX, shflDownReduceMax, FLOAT_MIN)
+KERNELREDUCEFUN2(KernelReduceMinOpLessBlocks, MIN, shflDownReduceMin, MAX_FLOAT)
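+
+/* here each 32-thread warp owns one reduced row: lanes stride through the
+   row 32 elements apart, then a single shfl-down pass folds the 32 partial
+   results into lane 0. */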
+
+
 /*
 we use PTX code to reduce
 */
-__global__
-void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum, 
-                       int reducedStrideNum,int blockSize, int blockNum)
-{
-    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
-
-    unsigned int tid = threadIdx.y;
-    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
-    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i >= stride * blockNum)
-        return;
-
-    /* first level reduction */
-    int k = i / stride;
-    int iOffset = i % stride;
-
-    DTYPE threadMax = FLOAT_MIN;
-
-    DTYPE * data = iData + threadIdx.x * blockDim.y;
-    DTYPE * inputData = input + k * blockSize;
-    for (int it = j; it < strideNum; it += blockDim.y){
-        threadMax = max(inputData[it * stride + iOffset], threadMax);
-    }
-
-    __syncthreads();
-    threadMax = shflDownReduceMax(threadMax);
-    if ((tid & 0x1f) == 0) 
-        data[tid / 32] = threadMax;
-
-    __syncthreads();
-    /* use one warp to reduce remaining data */
-    if (tid < 32){
-        if (tid < blockDim.y / 32)
-            threadMax = data[tid];
-        else threadMax = FLOAT_MIN;
-        threadMax = shflDownReduceMax(threadMax);
-        if (tid == 0 && blockIdx.y < reducedStrideNum)
-            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
-    }
+#define KERNELREDUCEFUN1(funName, opName, opFuncName, initData)                          \
+__global__                                                                               \
+void funName(DTYPE * input, DTYPE * output,int stride, int strideNum,                    \
+                       int reducedStrideNum,int blockSize, int blockNum)                 \
+{                                                                                        \
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];                          \
+                                                                                         \
+    unsigned int tid = threadIdx.y;                                                      \
+    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;                              \
+    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;                              \
+    if (i >= stride * blockNum)                                                          \
+        return;                                                                          \
+                                                                                         \
+    /* first level reduction */                                                          \
+    int k = i / stride;                                                                  \
+    int iOffset = i % stride;                                                            \
+                                                                                         \
+    DTYPE threadMax = initData;                                                          \
+                                                                                         \
+    DTYPE * data = iData + threadIdx.x * blockDim.y;                                     \
+    DTYPE * inputData = input + k * blockSize;                                           \
+    for (int it = j; it < strideNum; it += blockDim.y){                                  \
+        threadMax = opName(inputData[it * stride + iOffset], threadMax);                 \
+    }                                                                                    \
+                                                                                         \
+    __syncthreads();                                                                     \
+    threadMax = opFuncName(threadMax);                                                   \
+    if ((tid & 0x1f) == 0)                                                               \
+        data[tid / 32] = threadMax;                                                      \
+                                                                                         \
+    __syncthreads();                                                                     \
+    /* use one warp to reduce remaining data */                                          \
+    if (tid < 32){                                                                       \
+        if (tid < blockDim.y / 32)                                                       \
+            threadMax = data[tid];                                                       \
+        else threadMax = initData;                                                       \
+        threadMax = opFuncName(threadMax);                                               \
+        if (tid == 0 && blockIdx.y < reducedStrideNum)                                   \
+            output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;  \
+    }                                                                                    \
 }
 
+KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
+KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
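+
+/* two-level scheme: warp shuffles leave one partial result per warp in
+   shared memory, then the first warp reduces those partials into the final
+   value for the block. */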
+
 /* 
 get the max-valued items along a dimension of the tensor (cuda version). 
 For a 1-dimensional data array a,
@@ -497,202 +520,207 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
 >> output - the output tensor
 >> dim - which dimension to reduce
 */
-void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
-{
-    CheckNTErrors(input && output, "Empty input or output tensors!");
-    CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
-    CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
-    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
-
-    for(int i = 0; i < input->order; i++){
-        if(i < dim){
-            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
-        }
-        else if(i > dim){
-            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
-        }
-    }
-
-    int cudaGridSize[3];
-    int cudaBlockSize[3];
-    int iter = 0;
-    int stride = 1;
-    int strideNum = input->dimSize[dim];
-    int blockSize = 1;
-    int blockNum = 1;
-
-    for (int i = 0; i < input->order; i++) {
-        if (i < dim)
-            blockNum *= input->dimSize[i];
-        else if (i > dim)
-            stride *= input->dimSize[i];
-    }
-    blockSize = stride * strideNum;
-
-    int devID = input->devID;
-    XMem * mem = input->mem;
-
-    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-
-    int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2;
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
-    DTYPE * buf1 = buf;
-    DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
-
-    int devIDBackup;
-    ProtectCudaDev(input->devID, devIDBackup);
-
-    if (stride == 1 && blockNum >= 10) {
-        dim3 grids;
-        dim3 blocks;
-        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
-        if (blocks.y >= 128) {
-            KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
-        }
-        else {
-            if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
-            else blockNum = blockNum / 4;
-            KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
-        }
-    }
-    else {
-        do {
-            if (input->dataType == DEFAULT_DTYPE) {
-                DTYPE * iData = NULL;
-                DTYPE * oData = NULL;
-                if (iter == 0) {
-                    iData = (DTYPE*)input->data;
-                    oData = buf1;
-                }
-                else if (iter % 2 == 1) {
-                    iData = buf1;
-                    oData = buf2;
-                }
-                else {
-                    iData = buf2;
-                    oData = buf1;
-                }
-
-                /* unroll the reduction procedure. The code is messy but it is faster. */
-                if (strideNum < 32) {
-                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (DTYPE*)output->data;
-                    KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 128) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (DTYPE*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
-                    KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 256) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (DTYPE*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
-                    KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 512) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (DTYPE*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
-                    KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (DTYPE*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
-                    adjustThreadForUseWarpOptimization(blocks, threads);
-                    KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-            }
-            else if (input->dataType == X_FLOAT16) {
-                __half * buf1ft16 = (__half *)buf1;
-                __half * buf2ft16 = (__half *)buf2;
-                __half * iData = NULL;
-                __half * oData = NULL;
-                if (iter == 0) {
-                    iData = (__half*)input->data;
-                    oData = buf1ft16;
-                }
-                else if (iter % 2 == 1) {
-                    iData = buf1ft16;
-                    oData = buf2ft16;
-                }
-                else {
-                    iData = buf2ft16;
-                    oData = buf1ft16;
-                }
-
-                /* unroll the reduction procedure. The code is messy but it is faster. */
-                if (strideNum < 32) {
-                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (__half*)output->data;
-                    KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 128) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (__half*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
-                    KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 256) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (__half*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
-                    KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else if (strideNum < 512) {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (__half*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
-                    KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-                else {
-                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
-                    if (cudaGridSize[0] == 1)
-                        oData = (__half*)output->data;
-                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
-                    KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
-                }
-            }
-
-            strideNum = cudaGridSize[0];
-            blockSize = cudaGridSize[0];
-
-            iter++;
-
-        } while (strideNum > 1);
-    }
+#define _CUDAREDUCE(_funcName, _reduceFunc1, _reduceFunc2, _reduceFunc3, _reduceFun4)                                                         \
+void _funcName(const XTensor * input, XTensor * output, int dim)                                                                              \
+{                                                                                                                                             \
+    CheckNTErrors(input && output, "Empty input or output tensors!");                                                                         \
+    CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");                                                              \
+    CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");                                                             \
+    CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");                                                              \
+                                                                                                                                              \
+    for(int i = 0; i < input->order; i++){                                                                                                    \
+        if(i < dim){                                                                                                                          \
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");                                                     \
+        }                                                                                                                                     \
+        else if(i > dim){                                                                                                                     \
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");                                                 \
+        }                                                                                                                                     \
+    }                                                                                                                                         \
+                                                                                                                                              \
+    int cudaGridSize[3];                                                                                                                      \
+    int cudaBlockSize[3];                                                                                                                     \
+    int iter = 0;                                                                                                                             \
+    int stride = 1;                                                                                                                           \
+    int strideNum = input->dimSize[dim];                                                                                                      \
+    int blockSize = 1;                                                                                                                        \
+    int blockNum = 1;                                                                                                                         \
+                                                                                                                                              \
+    for (int i = 0; i < input->order; i++) {                                                                                                  \
+        if (i < dim)                                                                                                                          \
+            blockNum *= input->dimSize[i];                                                                                                    \
+        else if (i > dim)                                                                                                                     \
+            stride *= input->dimSize[i];                                                                                                      \
+    }                                                                                                                                         \
+    blockSize = stride * strideNum;                                                                                                           \
+                                                                                                                                              \
+    int devID = input->devID;                                                                                                                 \
+    XMem * mem = input->mem;                                                                                                                  \
+                                                                                                                                              \
+    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);                                         \
+                                                                                                                                              \
+    int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2;                                                                    \
+    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);                        \
+    DTYPE * buf1 = buf;                                                                                                                       \
+    DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;                                                                                 \
+                                                                                                                                              \
+    int devIDBackup;                                                                                                                          \
+    ProtectCudaDev(input->devID, devIDBackup);                                                                                                \
+                                                                                                                                              \
+    if (stride == 1 && blockNum >= 10) {                                                                                                      \
+        dim3 grids;                                                                                                                           \
+        dim3 blocks;                                                                                                                          \
+        continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);                                                     \
+        if (blocks.y >= 128) {                                                                                                                \
+            _reduceFunc1 <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);  \
+        }                                                                                                                                     \
+        else {                                                                                                                                \
+            if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;                                                                        \
+            else blockNum = blockNum / 4;                                                                                                     \
+            _reduceFunc2 <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);                              \
+        }                                                                                                                                     \
+    }                                                                                                                                         \
+    else {                                                                                                                                    \
+        do {                                                                                                                                  \
+            if (input->dataType == DEFAULT_DTYPE) {                                                                                           \
+                DTYPE * iData = NULL;                                                                                                         \
+                DTYPE * oData = NULL;                                                                                                         \
+                if (iter == 0) {                                                                                                              \
+                    iData = (DTYPE*)input->data;                                                                                              \
+                    oData = buf1;                                                                                                             \
+                }                                                                                                                             \
+                else if (iter % 2 == 1) {                                                                                                     \
+                    iData = buf1;                                                                                                             \
+                    oData = buf2;                                                                                                             \
+                }                                                                                                                             \
+                else {                                                                                                                        \
+                    iData = buf2;                                                                                                             \
+                    oData = buf1;                                                                                                             \
+                }                                                                                                                             \
+                                                                                                                                              \
+                /* unroll the reduction procedure. The code is messy but it is faster. */                                                     \
+                if (strideNum < 32) {                                                                                                         \
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);                         \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (DTYPE*)output->data;                                                                                         \
+                    _reduceFunc3 <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                      \
+                }                                                                                                                             \
+                else if (strideNum < 128) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);        \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (DTYPE*)output->data;                                                                                         \
+                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");                           \
+                    adjustThreadForUseWarpOptimization(blocks, threads);                                                                      \
+                    _reduceFun4<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                   \
+                }                                                                                                                             \
+                else if (strideNum < 256) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (DTYPE*)output->data;                                                                                         \
+                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");                          \
+                    adjustThreadForUseWarpOptimization(blocks, threads);                                                                      \
+                    _reduceFun4<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+                else if (strideNum < 512) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (DTYPE*)output->data;                                                                                         \
+                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");                          \
+                    adjustThreadForUseWarpOptimization(blocks, threads);                                                                      \
+                    _reduceFun4<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+                else {                                                                                                                        \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (DTYPE*)output->data;                                                                                         \
+                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");                          \
+                    adjustThreadForUseWarpOptimization(blocks, threads);                                                                      \
+                    _reduceFun4<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+            }                                                                                                                                 \
+            else if (input->dataType == X_FLOAT16) {                                                                                          \
+                __half * buf1ft16 = (__half *)buf1;                                                                                           \
+                __half * buf2ft16 = (__half *)buf2;                                                                                           \
+                __half * iData = NULL;                                                                                                        \
+                __half * oData = NULL;                                                                                                        \
+                if (iter == 0) {                                                                                                              \
+                    iData = (__half*)input->data;                                                                                             \
+                    oData = buf1ft16;                                                                                                         \
+                }                                                                                                                             \
+                else if (iter % 2 == 1) {                                                                                                     \
+                    iData = buf1ft16;                                                                                                         \
+                    oData = buf2ft16;                                                                                                         \
+                }                                                                                                                             \
+                else {                                                                                                                        \
+                    iData = buf2ft16;                                                                                                         \
+                    oData = buf1ft16;                                                                                                         \
+                }                                                                                                                             \
+                                                                                                                                              \
+                /* unroll the reduction procedure. The code is messy but it is faster. */                                                     \
+                if (strideNum < 32) {                                                                                                         \
+                    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);                         \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (__half*)output->data;                                                                                        \
+                    _reduceFunc3 <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                      \
+                }                                                                                                                             \
+                else if (strideNum < 128) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);        \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (__half*)output->data;                                                                                        \
+                    CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");                           \
+                    _reduceFun4<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                   \
+                }                                                                                                                             \
+                else if (strideNum < 256) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (__half*)output->data;                                                                                        \
+                    CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");                          \
+                    _reduceFun4<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+                else if (strideNum < 512) {                                                                                                   \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (__half*)output->data;                                                                                        \
+                    CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");                          \
+                    _reduceFun4<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+                else {                                                                                                                        \
+                    GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);       \
+                    dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);                               \
+                    if (cudaGridSize[0] == 1)                                                                                                 \
+                        oData = (__half*)output->data;                                                                                        \
+                    CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");                          \
+                    _reduceFun4<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);                  \
+                }                                                                                                                             \
+            }                                                                                                                                 \
+                                                                                                                                              \
+            strideNum = cudaGridSize[0];                                                                                                      \
+            blockSize = cudaGridSize[0];                                                                                                      \
+                                                                                                                                              \
+            iter++;                                                                                                                           \
+                                                                                                                                              \
+        } while (strideNum > 1);                                                                                                              \
+    }                                                                                                                                         \
+                                                                                                                                              \
+    BacktoCudaDev(input->devID, devIDBackup);                                                                                                 \
+                                                                                                                                              \
+    if (mem != NULL)                                                                                                                          \
+        mem->ReleaseBuf(mem->devID, bufSize);                                                                                                 \
+    else                                                                                                                                      \
+        XMemFree(input->devID, buf);                                                                                                          \
+}
 
-    BacktoCudaDev(input->devID, devIDBackup);
+_CUDAREDUCE(_CudaReduceMax, KernelReduceMaxOp, KernelReduceMaxOpLessBlocks, KernelReduceMax, KernelReduceMaxFast)
+_CUDAREDUCE(_CudaReduceMin, KernelReduceMinOp, KernelReduceMinOpLessBlocks, KernelReduceMin, KernelReduceMinFast)
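Editor's note: _CUDAREDUCE stamps the entire host-side dispatcher twice, so _CudaReduceMax and _CudaReduceMin share the grid-size selection, the ping-pong reduction buffers and the unrolled kernel ladder, including the X_FLOAT16 branch, which now also routes through the macro parameters. A hedged usage sketch follows; NewTensor2D, NewTensor1D, SetDataRand and DelTensor are assumed to be the usual NiuTrans.Tensor helpers, so verify them against the local headers before copying:

XTensor * input  = NewTensor2D(32, 4096, X_FLOAT, 0);  /* 32 rows of 4096 floats on GPU 0 */
XTensor * output = NewTensor1D(32, X_FLOAT, 0);        /* one reduced value per row */
input->SetDataRand(-1.0F, 1.0F);                       /* assumed initializer */

_CudaReduceMax(input, output, 1);  /* row-wise maxima along dimension 1 */
_CudaReduceMin(input, output, 1);  /* row-wise minima (overwrites output) */

DelTensor(input);
DelTensor(output);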
 
-    if (mem != NULL)
-        mem->ReleaseBuf(mem->devID, bufSize);
-    else
-        XMemFree(input->devID, buf);
-}
 
 #endif // USE_CUDA
 
diff --git a/source/tensor/core/reduce/ReduceMax.cuh b/source/tensor/core/reduce/ReduceMax.cuh
index f21ac1d..74fb97d 100644
--- a/source/tensor/core/reduce/ReduceMax.cuh
+++ b/source/tensor/core/reduce/ReduceMax.cuh
@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max-valued items along a dimension of the tensor (cuda version) */
 void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
 
+/* get the min-valued items along a dimension of the tensor (cuda version) */
+void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
+
 #endif // USE_CUDA
 
 } // namespace nts(NiuTrans.Tensor)
diff --git a/source/tensor/core/reduce/ReduceMax.h b/source/tensor/core/reduce/ReduceMax.h
index 9924195..afaff3d 100644
--- a/source/tensor/core/reduce/ReduceMax.h
+++ b/source/tensor/core/reduce/ReduceMax.h
@@ -29,12 +29,21 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max value of the items along a dimension of the tensor. */
 void _ReduceMax(const XTensor * input, XTensor * output, int dim);
 
+/* get the min value of the items along a dimension of the tensor. */
+void _ReduceMin(const XTensor * input, XTensor * output, int dim);
+
 /* 
 get the max value of the items along a dimension of the tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
 */
 XTensor ReduceMax(const XTensor &input, int dim);
 
+/*
+get the min value of the items along a dimension of the tensor (return an XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor ReduceMin(const XTensor &input, int dim);
+
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __REDUCEMAX_H__
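Editor's note: the functional wrappers return a fresh tensor whose shape is the input's with dimension dim dropped, which is why the low-level routines assert input->order == output->order + 1. A sketch of what the ReduceMin wrapper plausibly looks like, mirroring the ReduceMax wrapper only approximately (the XTensor constructor signature is an assumption here, and XLink/gradient bookkeeping is omitted):

XTensor ReduceMin(const XTensor &input, int dim)
{
    int order = input.order - 1;
    int * dimSize = new int[order];
    for (int i = 0; i < order; i++)
        dimSize[i] = input.dimSize[i < dim ? i : i + 1];  /* skip dimension `dim` */

    XTensor output(order, dimSize, input.dataType, input.denseRatio,
                   input.devID, input.mem);
    output.SetTMPFlag();

    _ReduceMin(&input, &output, dim);  /* the low-level routine declared above */

    delete[] dimSize;
    return output;
}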
diff --git a/source/tensor/core/reduce/VectorBuffer.cpp b/source/tensor/core/reduce/VectorBuffer.cpp
index 09df1fb..4b90de6 100644
--- a/source/tensor/core/reduce/VectorBuffer.cpp
+++ b/source/tensor/core/reduce/VectorBuffer.cpp
@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
     return *this;
 }
 
+/* compute the minimum of two buffers */
+VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
+    for (int i = 0; i != a.size(); i++) {
+        this->values[i] = MIN(a[i], this->values[i]);
+    }
+    return *this;
+}
+
 }/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
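Editor's note: minData is the CPU-side counterpart of the CUDA kernels: the vectorized reduction keeps a running buffer and min-folds one strip of input into it per step, collapsing the buffer to a scalar only at the end. An illustrative plain-C++ sketch of that fold (standard library only, not NiuTrans API):

#include <algorithm>
#include <vector>

float reduceMinStrips(const float * data, int strips, int width)
{
    std::vector<float> acc(data, data + width);        /* first strip seeds the buffer */
    for (int s = 1; s < strips; s++)                   /* fold each remaining strip in */
        for (int i = 0; i < width; i++)
            acc[i] = std::min(acc[i], data[s * width + i]);
    return *std::min_element(acc.begin(), acc.end()); /* collapse the buffer to a scalar */
}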
diff --git a/source/tensor/core/reduce/VectorBuffer.h b/source/tensor/core/reduce/VectorBuffer.h
index fc7fa1d..ebe6a72 100644
--- a/source/tensor/core/reduce/VectorBuffer.h
+++ b/source/tensor/core/reduce/VectorBuffer.h
@@ -48,5 +48,8 @@ public:
 
     /* compute the maximum of two buffers */
     VectorBuffer maxData(const VectorBuffer &a); 
+
+    /* compute the minimum of two buffers */
+    VectorBuffer minData(const VectorBuffer &a);
 };
 }
\ No newline at end of file
--
libgit2 0.26.0