NiuTrans.Tensor · Commit f5149a15
authored Oct 30, 2019 by liyinqiao

Merge with Yuhao branch (with little bit change).

parent f0b49d6d
Showing 56 changed files with 1285 additions and 1067 deletions (+1285 -1067).
source/tensor/Main.cpp  +3 -2
source/tensor/XGlobal.cpp  +0 -8
source/tensor/XGlobal.h  +0 -2
source/tensor/XMem.cpp  +2 -2
source/tensor/XTensor.cpp  +8 -21
source/tensor/XTensor.h  +0 -3
source/tensor/core/arithmetic/Div.cpp  +7 -10
source/tensor/core/arithmetic/Div.cu  +8 -9
source/tensor/core/arithmetic/MatrixMul.cpp  +61 -60
source/tensor/core/arithmetic/MatrixMulBatched.cpp  +40 -40
source/tensor/core/arithmetic/MulAndShift.cpp  +17 -16
source/tensor/core/arithmetic/Multiply.cpp  +8 -11
source/tensor/core/arithmetic/Multiply.cu  +10 -11
source/tensor/core/arithmetic/SumDim.cpp  +0 -14
source/tensor/core/arithmetic/SumDim.cu  +4 -4
source/tensor/core/getandset/OnehotAndIndex.cpp  +41 -0
source/tensor/core/getandset/OnehotAndIndex.h  +3 -0
source/tensor/core/getandset/Select.cpp  +84 -11
source/tensor/core/getandset/Select.h  +4 -1
source/tensor/core/math/Normalize.cpp  +8 -9
source/tensor/core/math/Normalize.cu  +5 -6
source/tensor/core/movement/CopyInGrid.cpp  +2 -3
source/tensor/core/movement/CopyIndexed.cpp  +21 -19
source/tensor/core/movement/Gather.cpp  +69 -7
source/tensor/core/movement/Gather.cu  +69 -0
source/tensor/core/movement/Gather.cuh  +2 -0
source/tensor/core/movement/Gather.h  +6 -0
source/tensor/core/reduce/ReduceMax.cpp  +166 -181
source/tensor/core/reduce/ReduceMax.cu  +414 -387
source/tensor/core/reduce/ReduceMax.cuh  +3 -0
source/tensor/core/reduce/ReduceMax.h  +8 -2
source/tensor/core/reduce/ReduceMean.cpp  +1 -2
source/tensor/core/reduce/ReduceSum.cpp  +13 -14
source/tensor/core/reduce/ReduceSum.cu  +26 -25
source/tensor/core/reduce/ReduceVariance.cpp  +1 -2
source/tensor/core/reduce/VectorBuffer.cpp  +11 -1
source/tensor/core/reduce/VectorBuffer.h  +4 -1
source/tensor/core/shape/ConcatenateSolely.cpp  +12 -13
source/tensor/core/shape/IsSameShaped.cpp  +1 -1
source/tensor/core/shape/Merge.cpp  +19 -21
source/tensor/core/shape/Merge.h  +0 -3
source/tensor/core/shape/Split.cpp  +18 -20
source/tensor/core/shape/Unsqueeze.cpp  +7 -8
source/tensor/core/shape/Unsqueeze.cu  +3 -4
source/tensor/core/sort/Sort.cpp  +6 -7
source/tensor/core/sort/Sort.cu  +8 -9
source/tensor/core/sort/TopK.cpp  +12 -13
source/tensor/core/sort/TopK.cu  +6 -7
source/tensor/core/utilities/SetAscendingOrder.cpp  +6 -7
source/tensor/core/utilities/SetAscendingOrder.cu  +7 -8
source/tensor/function/LogSoftmax.cpp  +15 -17
source/tensor/function/LogSoftmax.cu  +3 -4
source/tensor/function/Loss.cpp  +14 -17
source/tensor/function/Loss.cu  +10 -11
source/tensor/function/Softmax.cpp  +6 -9
source/tensor/function/Softmax.cu  +3 -4
source/tensor/Main.cpp
@@ -30,8 +30,9 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
-#include "./loss/CrossEntropy.h"
+#include "./XBLAS.h"
+#include "./core/sort/TopK.h"
+#include "./core/movement/Gather.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
 //#include <crtdbg.h>
...
source/tensor/XGlobal.cpp
@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
 bool CONST_TRUE = true;
 int verboseLevel = 0;
-bool useBLAS = false;
-#ifdef USE_CUDA
-bool useCUDA = true;
-#else
-bool useCUDA = false;
-#endif
 FILE * tmpLog = NULL;
 double myTime = 0;
...
source/tensor/XGlobal.h
@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
 #define NIUTRANSNNDEBUG
 extern int verboseLevel;
-extern bool useBLAS;
-extern bool useCUDA;
 #define FFLUSH(FILEH) \
 { \
...
source/tensor/XMem.cpp
@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
     if (freeMem >= MILLION * 512){
         *myBufSize = MILLION * 128;
         if (freeMem >= MILLION * 1024) {
-            *myBufSize = MILLION * 256;
+            *myBufSize = MILLION * 128;
             if (freeMem >= MILLION * 2048)
-                *myBufSize = MILLION * 512;
+                *myBufSize = MILLION * 128;
         }
     }
 }
...
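The change above caps the memory-pool buffer at 128 MB whenever at least 512 MB of free device memory is reported; previously the buffer grew to 256 MB (at 1 GB free) and 512 MB (at 2 GB free). A minimal standalone sketch of the resulting policy, assuming MILLION means 2^20 bytes and using a plain integer in place of the library's MTYPE:

// Sketch of the buffer-size policy after this commit (not the library's API).
long long GetBufferSizeSketch(long long freeMem)
{
    const long long MILLION = 1024 * 1024;   // assumed meaning of MILLION in XMem
    long long bufSize = 0;
    if (freeMem >= MILLION * 512)
        bufSize = MILLION * 128;             // was 256 MB (>= 1 GB free) or 512 MB (>= 2 GB free)
    return bufSize;
}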
source/tensor/XTensor.cpp
@@ -266,7 +266,6 @@ void XTensor::Init()
     devID = -1;
     order = -1;
     memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = DEFAULT_DTYPE;
     unitSize = sizeof(float);
     unitNum = 0;
@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
     order = tensor.order;
     enableGrad = tensor.enableGrad;
     memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
-    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
     dataType = tensor.dataType;
     unitSize = tensor.unitSize;
     unitNum = tensor.unitNum;
@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
 bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
 {
     if (a == NULL || b == NULL)
         return false;
     if ((a->order - 1) != b->order)
@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
 {
     for (int i = 0; i < order; i++) {
         dimSize[i] = myDimSize[i];
-        dimSizeRDI[order - i - 1] = myDimSize[i];
     }
 }
@@ -598,20 +595,17 @@ reshape the tensor
 void XTensor::Reshape(const int myOrder, const int * myDimSize)
 {
     int dims[MAX_TENSOR_DIM_NUM];
-    int dimsRDI[MAX_TENSOR_DIM_NUM];
     int num = 1;
     for (int i = 0; i < myOrder; i++){
         num *= myDimSize[i];
         dims[i] = abs(myDimSize[i]);
-        dimsRDI[myOrder - i - 1] = dims[i];
     }
     CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
     order = myOrder;
     memcpy(dimSize, dims, sizeof(int) * order);
-    memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
 }
 /*
@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
 {
     CheckNTErrors((size == order), "Illegal index!");
-    int * indexRDI = new int[size];
-    for (int i = 0; i < size; i++)
-        indexRDI[size - i - 1] = index[i];
-    int offset = indexRDI[size - 1];
-    for (int i = size - 2; i >= 0; i--){
-        CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
-        offset = offset * dimSizeRDI[i] + indexRDI[i];
-    }
-    delete[] indexRDI;
+    int offset = index[0];
+    for (int i = 1; i < size; ++i){
+        CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
+        offset = offset * dimSize[i] + index[i];
+    }
     if (isSparse){
         DTYPE value;
         void * p;
@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
     bool zeroData = false;
     for (int i = 0; i < order; i++){
         dimSize[i] = abs(myDimSize[i]);
-        dimSizeRDI[order - i - 1] = dimSize[i];
         if (myDimSize[i] < 0)
             filledData = false;
         if (myDimSize[i] == 0)
@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
     if (isSparse) {
         int num = 0;
         for (int i = 0; i < order; i++)
-            num *= dimSizeRDI[i];
+            num *= dimSize[i];
         num = int(num * denseRatio + 1);
         int tupleSize = sizeof(int) + sizeof(DTYPE);
         int size = sizeof(int) + tupleSize * (num);
@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
     int ds[MAX_TENSOR_DIM_NUM];
     for (int i = 0; i < order; i++) {
-        ds[i] = key % dimSizeRDI[i];
-        key /= dimSizeRDI[i];
+        ds[i] = key % dimSize[i];
+        key /= dimSize[i];
     }
     Set(value, ds);
 }
...
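The GetCell hunk replaces the reversed-dimension-indexing (RDI) offset computation with a direct row-major one over dimSize. A small self-contained sketch of that addressing scheme (a hypothetical helper, not part of the library):

#include <cassert>

// Row-major linear offset of an element, matching the new XTensor::GetCell logic.
int RowMajorOffset(const int index[], const int dimSize[], int order)
{
    assert(order > 0);
    int offset = index[0];
    for (int i = 1; i < order; ++i) {
        assert(index[i] < dimSize[i]);            // bounds check, as CheckNTErrors does above
        offset = offset * dimSize[i] + index[i];
    }
    return offset;                                // element index; multiply by unitSize for a byte offset
}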
source/tensor/XTensor.h
@@ -100,9 +100,6 @@ public:
     /* size of each dimension */
     int dimSize[MAX_TENSOR_DIM_NUM];
-    /* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
-    int dimSizeRDI[MAX_TENSOR_DIM_NUM];
     /* data unit - data type for every cell */
     TENSOR_DATA_TYPE dataType;
...
source/tensor/core/arithmetic/Div.cpp
@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);
-    int leadingDimRDI = a->order - leadingDim - 1;
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaDiv(a, b, c, alpha, leadingDim);
@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/Div.cu
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/MatrixMul.cpp
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
         return;
     }
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     int bBlockNum = 1;
     int cBlockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
-        aBlockNum *= a->dimSizeRDI[i];
-        cBlockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
+        aBlockNum *= a->dimSize[i];
+        cBlockNum *= a->dimSize[i];
     }
-    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
-        bBlockNum *= b->dimSizeRDI[i];
-        cBlockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++) {
+        CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
+        bBlockNum *= b->dimSize[i];
+        cBlockNum *= b->dimSize[i];
     }
     TensorList * aList = new TensorList(10);
     TensorList * bList = new TensorList(10);
     TensorList * cList = new TensorList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
     bool isSparseMul = false;
@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
         return false;
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a->order + b->order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a->order; i++)
-        dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
-    for (int i = 2; i < b->order; i++)
-        dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
+    for (int i = 0; i < a->order - 2; i++)
+        dimSize[sub++] = a->dimSize[i];
+    for (int i = 0; i < b->order - 2; i++)
+        dimSize[sub++] = b->dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
-        int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-        int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-        int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-        int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+        int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+        int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+        int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+        int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
     if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
-        int an = a.dimSizeRDI[1];
-        int am = a.dimSizeRDI[0];
-        int bn = b.dimSizeRDI[1];
-        int bm = b.dimSizeRDI[0];
+        int an = a.dimSize[a.order - 2];
+        int am = a.dimSize[a.order - 1];
+        int bn = b.dimSize[b.order - 2];
+        int bm = b.dimSize[b.order - 1];
         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
...
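A pattern that repeats throughout this file: the matrix dimensions of an operand are now read from its two trailing axes of dimSize rather than from dimSizeRDI[0] and dimSizeRDI[1]. A minimal sketch of that convention (hypothetical helper; X_TRANS-style transposition is modeled here with a bool):

// Rows/cols of the matrix held in the last two axes of a row-major shape,
// as an/am/bn/bm are derived in the hunks above.
struct MatrixDims { int rows; int cols; };

MatrixDims TrailingMatrixDims(const int dimSize[], int order, bool transposed)
{
    int rows = dimSize[order - 2];
    int cols = dimSize[order - 1];
    if (transposed)                 // a transposed operand swaps the two roles
        return { cols, rows };
    return { rows, cols };
}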
source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
     CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
     int devIDBackup = 0;
@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                             a->data, transposedA, a->dataType, aBlockSize,
                             b->data, transposedB, b->dataType, bBlockSize,
                             c->data, c->dataType, cBlockSize, blockNum,
-                            a->dimSizeRDI[1], a->dimSizeRDI[0],
-                            b->dimSizeRDI[1], b->dimSizeRDI[0],
-                            c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+                            a->dimSize[a->order - 2], a->dimSize[a->order - 1],
+                            b->dimSize[b->order - 2], b->dimSize[b->order - 1],
+                            c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
     BacktoCudaDev(a->devID, devIDBackup);
 #endif
@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];
     CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;
-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };
     XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
     XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];
     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
...
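The batched variants likewise now count matrices by multiplying every axis except the two trailing ones. A one-function sketch of that block count (hypothetical helper, mirroring the rewritten blockNum loops):

// Number of 2-D matrices stacked in a batched operand of the given shape.
int BatchedBlockNum(const int dimSize[], int order)
{
    int blockNum = 1;
    for (int i = 0; i < order - 2; i++)   // skip the trailing matrix axes
        blockNum *= dimSize[i];
    return blockNum;
}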
source/tensor/core/arithmetic/MulAndShift.cpp
@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
-    int xn = x.dimSizeRDI[1];
-    int xm = x.dimSizeRDI[0];
-    int wn = w.dimSizeRDI[1];
-    int wm = w.dimSizeRDI[0];
+    int xn = x.dimSize[x.order - 2];
+    int xm = x.dimSize[x.order - 1];
+    int wn = w.dimSize[w.order - 2];
+    int wm = w.dimSize[w.order - 1];
     CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
-    int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
-    int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
-    int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
-    int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
+    int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
+    int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
+    int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
+    int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
...
source/tensor/core/arithmetic/Multiply.cpp
@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);
-    int leadingDimRDI = a->order - leadingDim - 1;
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaMultiply(a, b, c, alpha, leadingDim);
@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/Multiply.cu
@@ -122,26 +122,25 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
-    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
+    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
                   "Unmatched tensors in multiplication!");
-    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];
     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }
     blockSizeA = stride * dimensionSizeA;
...
source/tensor/core/arithmetic/SumDim.cpp
@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
         return;
     }
-    /*int dims[MAX_TENSOR_DIM_NUM];
-    for(int i = 0; i < a->order; i++)
-        dims[i] = 1;
-    dims[n] = a->GetDim(n);
-    XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
-    _CopyValues(b, b2);
-    _SumBroadcast(a, b2, c, beta);
-    DelTensor(b2);
-    return;*/
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
 #ifdef USE_CUDA
         _CudaSumDim(a, b, c, n, beta);
...
source/tensor/core/arithmetic/SumDim.cu
@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
     int col = colIndex % colNum;
     int block = colIndex / colNum;
-    if (row >= rowNum || block >= blockNum)
+    if(row >= rowNum || block >= blockNum)
         return;
-    if (threadIdx.x == 0)
+    if(threadIdx.x == 0)
         bv[threadIdx.y] = b[row];
     __syncthreads();
     int offset = block * blockSize + row * colNum + col;
-    if (betaFired)
+    if(betaFired)
         c[offset] = a[offset] + bv[threadIdx.y] * beta;
     else
         c[offset] = a[offset] + bv[threadIdx.y];
...
source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -139,6 +139,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
 }
+/*
+convert index tensor to onehot tensor
+>> index - index tensor, which value is an integer num
+>> onehot - onehot tensor, which value is 0 or 1
+>> size - the last dimension size of the onehot tensor
+*/
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
+{
+    /*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
+    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+
+    onehot->SetZeroAll();
+
+#ifdef USE_CUDA
+    if (onehot->devID >= 0) {
+        delete[] cudaIndex;
+        return;
+    }
+#endif
+
+    int blockNum = n;
+    int stride = size;
+
+    int * indexData = (int *)index;
+    int * onehotData = (int *)onehot->data;
+
+    for (int i = 0; i < blockNum; i++) {
+        int id = indexData[i];
+        int * od = onehotData + i * stride;
+        od[id] = 1;
+    }*/
+
+    XTensor * cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
+    cudaIndex->SetData(index, n);
+    _IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
+    delete[] cudaIndex;
+}
 /*
 convert onehot tensor to index tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
...
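The added overload wraps a plain int array in a temporary 1-D X_INT tensor and forwards to the existing XTensor-based _IndexToOnehot. A hedged usage sketch (the sizes and index values are invented; NewTensor2D and SetZeroAll are used here as they appear elsewhere in this diff, and exact default arguments may differ by version):

#include "OnehotAndIndex.h"   // header extended in this commit
using namespace nts;

// Hypothetical call site: turn 3 class ids into 3 one-hot rows of width 10.
void IndexToOnehotExample()
{
    int index[3] = { 2, 5, 7 };                        // one class id per sample
    XTensor * onehot = NewTensor2D(3, 10, X_INT, -1);  // CPU tensor (devID = -1)
    onehot->SetZeroAll();
    _IndexToOnehot(index, 3, onehot, 10, 0.0F);        // 0.0F: no label smoothing
    delete onehot;
}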
source/tensor/core/getandset/OnehotAndIndex.h
@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
 /* convert index tensor to onehot tensor */
 void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
+
+/* convert index tensor to onehot tensor */
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
 /* convert index tensor to onehot tensor (return an XTensor structure)
 make a new tensor to keep the result and return it */
 XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
...
source/tensor/core/getandset/Select.cpp
@@ -25,6 +25,82 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, int* index, int dim)
+{
+    CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
+    CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
+    CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
+    CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");
+
+    int stride = 1;
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
+    printf("\n%d %d\n", a->order - dim - 1, stride);
+    int copyTimes = 1;
+    for (int i = 0; i < dim; i++)
+    {
+        copyTimes *= a->dimSize[i];
+    }
+    int cot = c->dimSize[dim];
+    int blockSize = stride * a->unitSize;
+    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
+    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
+    char * s = (char*)a->data;
+    char * t = (char*)c->data;
+    for (int i = 0; i < copyTimes; i++)
+    {
+        for (int j = 0; j < cot; ++j)
+        {
+            XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
+        }
+        s += stepSizeS;
+        t += stepSizeT;
+    }
+}
+
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, XTensor * index, int dim)
+{
+    if (index->devID >= 0)
+    {
+        int * indexCPU = new int[index->unitNum];
+        XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
+        _Select(a, c, indexCPU, dim);
+        delete[] indexCPU;
+    }
+    else
+    {
+        _Select(a, c, (int *)index->data, dim);
+    }
+}
+
+/*
+*/
+/*XTensor Select(const XTensor &a, int* index, int dim)
+{
+}*/
 /*
 generate a tensor with selected data in range[low,high] along the given dimension
@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
     }
     int stride = 1;
-    int dimRDI = a->order - dim - 1;
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
     int copyTimes = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        copyTimes *= a->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        copyTimes *= a->dimSize[i];
     int blockSize = stride * (high - low) * a->unitSize;
     int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
     _SelectRange(&a, &c, dim, low, high);
     /* tensor connection */
-    if (a.enableGrad) {
-        XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
-        XLink::AddParamToHeadInt(&c, dim);
-        XLink::AddParamToHeadInt(&c, low);
-        XLink::AddParamToHeadInt(&c, high);
-    }
+    XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
+    XLink::AddParamToHeadInt(&c, dim);
+    XLink::AddParamToHeadInt(&c, low);
+    XLink::AddParamToHeadInt(&c, high);
     /* destroy variables */
     delete[] dimSize;
...
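The new _Select copies, for each outer block, the index-selected slices along dim from a into c, so c must already be shaped with c->dimSize[dim] equal to the number of picked indices. A hedged usage sketch (shape and indices invented for illustration; NewTensor2D and X_FLOAT are used as they appear elsewhere in this diff):

#include "Select.h"   // declares both _Select overloads added above
using namespace nts;

// Hypothetical call site: keep columns 0 and 2 of a 4 x 3 matrix (dim = 1).
void SelectExample()
{
    XTensor * a = NewTensor2D(4, 3, X_FLOAT, -1);   // source on CPU; data left unset here
    XTensor * c = NewTensor2D(4, 2, X_FLOAT, -1);   // c->dimSize[1] == number of picked indices
    int picked[2] = { 0, 2 };
    _Select(a, c, picked, 1);                       // c[:,0] = a[:,0], c[:,1] = a[:,2]
    delete a;
    delete c;
}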
source/tensor/core/getandset/Select.h
@@ -27,7 +27,10 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
+void _Select(const XTensor * a, XTensor * c, int* index, int dim);
+
+/* generate a tensor with selected data c = select(a) */
+void _Select(const XTensor * a, XTensor * c, XTensor * index, int dim);
 /*
 generate a tensor with selected data c = select(a) (returna a XTensor structure)
...
source/tensor/core/math/Normalize.cpp
...
@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
                 const XTensor * mean, const XTensor * var,
                 const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-    int dimRDI = input->order - dim - 1;
     CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
     CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
     CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
-    CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
+    CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
-            stride *= input->dimSizeRDI[i];
-        }
-        else if (i > dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
-            blockNum *= input->dimSizeRDI[i];
-        }
+        if (i < dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
+            blockNum *= input->dimSize[i];
+        }
+        else if (i > dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
+            stride *= input->dimSize[i];
+        }
     }
     blockSize = stride * strideNum;
...
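The switch from dimSizeRDI[dimRDI] to dimSize[dim] throughout these files relies on one decomposition of a row-major tensor around the reduced dimension. A small sketch of that decomposition is shown below; it assumes row-major storage (as XTensor's dimSize ordering implies) and uses made-up names for illustration.

    #include <cassert>

    // For a row-major tensor with shape dims[0..order-1] and dimension `dim`:
    //   blockNum  = prod(dims[0..dim-1])      (slices before dim)
    //   strideNum = dims[dim]                 (length along dim)
    //   stride    = prod(dims[dim+1..])       (contiguous elements per step of dim)
    // so element (b, j, k) lives at offset (b * strideNum + j) * stride + k.
    void Decompose(const int* dims, int order, int dim,
                   int& stride, int& strideNum, int& blockNum)
    {
        assert(dim >= 0 && dim < order);
        stride = 1; blockNum = 1; strideNum = dims[dim];
        for (int i = 0; i < dim; i++)         blockNum *= dims[i];
        for (int i = dim + 1; i < order; i++) stride   *= dims[i];
    }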
source/tensor/core/math/Normalize.cu
...
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = input->order - dim - 1;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }

     int cudaGridSize[3];
...
source/tensor/core/movement/CopyInGrid.cpp
...
@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
 {
     CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");

-    int blockDimRDI = s->order - blockDim - 1;
     int blockSize = 1;
     int blockNum = blockNumInGrid;
     int gridNum = 1;
-    for (int i = 0; i < blockDimRDI; i++)
-        blockSize *= s->dimSizeRDI[i];
+    for (int i = blockDim; i < s->order; i++)
+        blockSize *= s->dimSize[i];

     CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
     gridNum = s->unitNum / (blockSize * blockNum);
...
source/tensor/core/movement/CopyIndexed.cpp
...
@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
     CheckNTErrors(dim < s->order && dim < t->order, "A too larget dimension specified!");
     CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");

-    int dimRDI = s->order - dim - 1;
     int blockSizeSrc = 1;
     int blockSizeTgt = 1;
     int blockNumSrc = 1;
     int blockNumTgt = 1;
-    int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
-    int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
+    int leadDimSizeSrc = s->dimSize[dim];
+    int leadDimSizeTgt = t->dimSize[dim];
     int indexOffsetNum = 1;

-    for (int i = 0; i < dimRDI; i++) {
-        blockSizeSrc *= s->dimSizeRDI[i];
-        blockSizeTgt *= t->dimSizeRDI[i];
-    }
-    for (int i = dimRDI; i < s->order; i++)
-        blockNumSrc *= s->dimSizeRDI[i];
-    for (int i = dimRDI; i < t->order; i++)
-        blockNumTgt *= t->dimSizeRDI[i];
+    for (int i = dim + 1; i < s->order; i++) {
+        blockSizeSrc *= s->dimSize[i];
+    }
+    for (int i = dim + 1; i < t->order; i++) {
+        blockSizeTgt *= t->dimSize[i];
+    }
+    for (int i = 0; i <= dim; i++) {
+        blockNumSrc *= s->dimSize[i];
+        blockNumTgt *= t->dimSize[i];
+    }

     CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
-    indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
+    indexOffsetNum = blockNumSrc / s->dimSize[dim];

     int realIndexSize = indexOffsetNum * indexSize * copyNum;
     int * realSrcIndex = new int[realIndexSize];
...
@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it
 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3,2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-       e.g., for srcIndex = [0,1] and copyNum = 2,
-       we actually copy the source sub-tensors 0, 1, 1 and 2
+       e.g., for srcIndex = [1,4] and copyNum = 2,
+       we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim,
...
@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it
 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3,2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-       e.g., for srcIndex = [0,1] and copyNum = 2,
-       we actually copy the source sub-tensors 0, 1, 1 and 2
+       e.g., for srcIndex = [1,4] and copyNum = 2,
+       we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
...
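The updated copyNum comment can be checked with a tiny sketch that expands srcIndex into the full list of copied sub-tensor indices; the helper name here is made up for illustration and is not part of the library.

    #include <cstdio>
    #include <vector>

    // Each entry i in srcIndex contributes the consecutive sub-tensor indices
    // i, i+1, ..., i+copyNum-1, matching the semantics described in the comment.
    std::vector<int> ExpandCopies(const std::vector<int>& srcIndex, int copyNum)
    {
        std::vector<int> expanded;
        for (int i : srcIndex)
            for (int c = 0; c < copyNum; c++)
                expanded.push_back(i + c);
        return expanded;
    }

    int main()
    {
        // srcIndex = [1,4] and copyNum = 2 -> sub-tensors 1, 2, 4, 5 are copied
        for (int i : ExpandCopies({1, 4}, 2))
            printf("%d ", i);
        printf("\n");
        return 0;
    }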
source/tensor/core/movement/Gather.cpp
...
@@ -33,6 +33,51 @@ gather indexed sub-tensors
 >> s - the source tensor
 >> t - the target tensor
+>> dim - the leading dimension to define "sub-tensors"
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
+>> srcIndex - index of the source sub-tensors
+>> indexSize - length of srcIndex (and tgtIndex)
+*/
+void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
+{
+    int * tgtIndex = new int[indexSize];
+    for (int i = 0; i < indexSize; i++)
+        tgtIndex[i] = i;
+
+    _CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
+
+    delete[] tgtIndex;
+}
+
+/*
+gather indexed sub-tensors
+
+>> s - the source tensor
+>> t - the target tensor
+>> srcIndex - index of the source sub-tensors
+>> dim - the leading dimension to define "sub-tensors"
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
+*/
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    CheckNTErrors((s && t), "Invalid tensors!");
+    CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
+    CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
+
+#ifdef USE_CUDA
+    if (s->devID >= 0 && t->devID >= 0) {
+        _CudaGather(s, t, srcIndex, dim);
+        return;
+    }
+#endif
+}
+
+/*
+gather indexed sub-tensors
+
 >> srcIndex - the tensor to save the index of the source tensor
 */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
...
@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
     CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");

-    int order = index.order + 1;
+    int order = s.order;
     int * dimSize = new int[order];
-    memcpy(dimSize, index.dimSize, index.order * sizeof(int));
-    dimSize[index.order] = s.GetDim(-1);
+    for (int i = 0; i < s.order; i++) {
+        if (i == dim)
+            dimSize[i] = index.unitNum;
+        else
+            dimSize[i] = s.dimSize[i];
+    }

     float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
     XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
...
@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
     _Gather(&s, &t, &index);

     /* tensor connection */
-    XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    if (s.enableGrad) {
+        XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
+    }

-    return t;
+    if (index.order > 1) {
+        int * dims = new int[index.order + 1];
+        memcpy(dims, index.dimSize, index.order * sizeof(int));
+        dims[index.order] = t.GetDim(-1);
+
+        XTensor tt;
+        tt = Reshape(t, index.order + 1, dims);
+        delete[] dims;
+
+        return tt;
+    }
+    else {
+        return t;
+    }
 }

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
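A minimal usage sketch of the reshaping behaviour of Gather with a higher-order index is shown below; the sizes are invented, the index is assumed to be filled with valid row ids, and the InitTensor2D helper is assumed from the library's usual tensor-initialization API.

    // Gathering rows of a 2-D embedding table with a 2-D index: with the logic
    // above, the result keeps the index shape and appends the last dimension of
    // the source, i.e. (batch, len) x (vocab, emb) -> (batch, len, emb).
    void GatherExample()
    {
        XTensor table;                        // (vocab, emb)
        XTensor index;                        // (batch, len), integer row ids
        InitTensor2D(&table, 1000, 64, X_FLOAT);
        InitTensor2D(&index, 8, 32, X_INT);
        // ... fill index with values in [0, 1000) ...

        XTensor rows = Gather(table, index); // expected shape: (8, 32, 64)
    }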
source/tensor/core/movement/Gather.cu
...
@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
 /*
 gather indexed sub-tensors(cuda version)

+>> sData - the data pointer of the source tensor
+>> tData - the data pointer of the target tensor
+>> sIndex - the index of the source tensor
+>> indexSize - the size of the srcIndex
+>> stride - stride of a data block
+>> strideNum - strideNum of a data block
+>> blockNum - block size of data
+*/
+__global__
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
+{
+    int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    int idy = blockDim.y * blockIdx.y + threadIdx.y;
+
+    int blockIndex = idy / stride;
+    int offsetInBlock = idy % stride;
+
+    int size = stride * strideNum * blockNum;
+
+#pragma unroll
+    for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
+         i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
+         i += stride * blockDim.x) {
+        tData[i] = sData[sIndex[i]];
+    }
+}
+
+/*
+gather indexed sub-tensors(cuda version)
+
 >> s - the source tensor
 >> t - the target tensor
 >> srcIndex - the tensor to save the index of the source tensor
...
@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
     BacktoCudaDev(devID, devIDBackup);
 }

+/*
+gather indexed sub-tensors(cuda version)
+
+>> s - the source tensor
+>> t - the target tensor
+>> srcIndex - the tensor to save the index of the source tensor
+>> dim - the leading dimension to define "sub-tensors"
+*/
+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
+{
+    int devID = srcIndex->devID;
+    XMem * mem = s->mem;
+
+    int stride = 1;
+    int blockNum = 1;
+    int indexSize = srcIndex->unitNum;
+    int strideNum = srcIndex->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= srcIndex->dimSize[i];
+    for (int i = dim + 1; i < srcIndex->order; i++)
+        stride *= srcIndex->dimSize[i];
+
+    int * sIndex = NULL;
+    if (srcIndex->devID < 0) {
+        sIndex = mem != NULL ?
+                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
+                 (int*)XMemAlloc(mem->devID, sizeof(int) * indexSize);
+        XMemCopy(sIndex, devID, srcIndex, -1, sizeof(int) * indexSize);
+    }
+    else
+        sIndex = (int *)srcIndex->data;
+
+    int cudaGrids[3];
+    int cudaBlocks[3];
+    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);
+
+    KernelGather <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);
+}
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
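Under the assumption that the index tensor stores one flat source offset per output element, the new KernelGather is equivalent to the host-side reference loop below (a sketch only; bounds checking and type handling are omitted).

    // Every output element i copies the source element at the flat offset sIndex[i].
    void GatherByFlatIndexCPU(const float* sData, float* tData,
                              const int* sIndex, int outputSize)
    {
        for (int i = 0; i < outputSize; i++)
            tData[i] = sData[sIndex[i]];
    }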
source/tensor/core/movement/Gather.cuh
...
@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* gather indexed sub-tensors(cuda version) */
 void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
...
source/tensor/core/movement/Gather.h
...
@@ -27,8 +27,14 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* gather selected sub-tensors */
+void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
+
+/* gather selected sub-tensors */
 void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);

+/* gather selected sub-tensors accoding to the dimension */
+void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
+
 /* gather selected sub-tensors (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor Gather(XTensor &s, XTensor &index);
...
source/tensor/core/reduce/ReduceMax.cpp
...
@@ -35,122 +35,143 @@ get the max value of the items along a dimension of the tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on
 */
-void _ReduceMax(const XTensor * input, XTensor * output, int dim)
-{
-    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
-                  "This code must be run on the same device!");
-    CheckNTErrors((input && output), "Empty input or output tensors!");
-    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
-    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!");
-    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
-
-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
-    ...
-    if (input->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaReduceMax(input, output, dim);
-#endif
-    }
-    else {
-        CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
-        int stride = 1;
-        int strideNum = input->dimSizeRDI[dimRDI];
-        ...
-        if (input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32) {
-            ...
-            vecBuf[0] = vecBuf[0].maxData(VectorBuffer::loadu(ptr + 0 * vecBufLength));
-            ...
-        }
-        else {
-            ...
-            DTYPE max = DTYPE_MIN;
-            ...
-        }
-    }
-}
+#define _REDUCE_CPU_FUNCTION(_funcCPUName, _vectorOp, _reduceOp) \
+void _funcCPUName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), \
+                  "This code must be run on the same device!"); \
+    CheckNTErrors((input && output), "Empty input or output tensors!"); \
+    CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); \
+    CheckNTErrors((input->order > dim && dim >= 0), "Illegal dimension to reduce!"); \
+    CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); \
+    CheckNTErrors(dim < input->order, "Wrong dimension!"); \
+ \
+    for (int i = 0; i < input->order; i++) { \
+        if (i < dim) { \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), \
+                          "Unmatched tensors!"); \
+        } \
+        else if (i > dim) { \
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), \
+                          "Unmatched tensors!"); \
+        } \
+    } \
+ \
+    CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); \
+    int stride = 1; \
+    int strideNum = input->dimSize[dim]; \
+    int blockSize = 1; \
+    int blockNum = 1; \
+    for (int i = 0; i < input->order; i++) { \
+        if (i > dim) \
+            stride *= input->dimSize[i]; \
+        else if (i < dim) \
+            blockNum *= input->dimSize[i]; \
+    } \
+    blockSize = stride * strideNum; \
+ \
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){ \
+        int vecBufLength = 32 / sizeof(DTYPE); \
+        if (dim == input->order - 1) { \
+            /* data is contiguous in dim 0 */ \
+            for (int i = 0; i < blockNum; i++) { \
+                DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
+                DTYPE * op = (DTYPE*)output->data + i; \
+                VectorBuffer vecBuf[4]; \
+                for (int j = 0; j < 4; j++) { \
+                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength); \
+                } \
+                for (int j = 1; j < strideNum / 32; j++) { \
+                    const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
+                    vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
+                    vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
+                    vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
+                    vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength)); \
+                } \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[1]); \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[2]); \
+                vecBuf[0] = vecBuf[0]._vectorOp(vecBuf[3]); \
+                DTYPE maxN = vecBuf[0][0]; \
+                for (int k = 1; k < vecBufLength; k++) { \
+                    maxN = _reduceOp(maxN, vecBuf[0][k]); \
+                } \
+                *op = maxN; \
+            } \
+        } \
+        else { \
+            /* data is separated */ \
+            for(int i = 0; i < blockNum; i++){ \
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
+                    DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
+                    DTYPE * op = (DTYPE*)output->data + stride * i; \
+                    VectorBuffer vecBuf[4]; \
+                    for(int k = 0; k < 4; k++){ \
+                        vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE)); \
+                    } \
+                    for(int k = 1; k < strideNum; k++){ \
+                        DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength; \
+                        vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
+                        vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
+                        vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
+                        vecBuf[3] = vecBuf[3]._vectorOp(VectorBuffer::loadu(ptr + 3 * vecBufLength)); \
+                    } \
+                    for(int k = 0; k < 4; k++){ \
+                        for(int l = 0; l < vecBufLength; l++) \
+                            *(op + j * 32 + 8 * k + l) = vecBuf[k][l]; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    /* run vector buffer */ \
+    else{ \
+        for(int k = 0; k < blockNum; k++){ \
+            DTYPE * ip = (DTYPE*)input->data + blockSize * k; \
+            DTYPE * op = (DTYPE*)output->data + stride * k; \
+            for(int i = 0; i < stride; i++){ \
+                DTYPE * ipe = ip + blockSize; \
+                DTYPE tmpData = *(ip + i); \
+                for(DTYPE * ipb = ip + i + stride; ipb < ipe; ipb += stride){ \
+                    DTYPE v = *ipb; \
+                    tmpData = _reduceOp(tmpData, v); \
+                } \
+                *(op + i) = tmpData; \
+            } \
+        } \
+    } \
+}
+
+_REDUCE_CPU_FUNCTION(reduceMaxCPU, maxData, MAX)
+_REDUCE_CPU_FUNCTION(reduceMinCPU, minData, MIN)
+
+#ifdef USE_CUDA
+#define _REDUCE_FUNCTION(_funcName, _cudaFuncName) \
+void _funcName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    if(input->devID >= 0){ \
+        _cudaFuncName(input, output, dim); \
+    } \
+    else{ \
+        reduceMaxCPU(input, output, dim); \
+    } \
+}
+_REDUCE_FUNCTION(_ReduceMax, _CudaReduceMax)
+_REDUCE_FUNCTION(_ReduceMin, _CudaReduceMin)
+#else
+#define _REDUCE_FUNCTION(_funcName, reduceNameCPU) \
+void _funcName(const XTensor * input, XTensor * output, int dim) \
+{ \
+    CheckNTErrors((input->devID < 0), "This code must be run on the CPU!"); \
+    reduceNameCPU(input, output, dim); \
+}
+_REDUCE_FUNCTION(_ReduceMax, reduceMaxCPU)
+_REDUCE_FUNCTION(_ReduceMin, reduceMinCPU)
+#endif

 /*
 get the max value of the items along a dimension of the tensor (return an XTensor structure).
...
@@ -160,74 +181,38 @@ make a new tensor to keep the result and return it
 >> dim - the dimension where the reduction is performed on
 << return - the max value of the items along a dimension of the tensor
 */
-XTensor ReduceMax(const XTensor &input, int dim)
-{
-    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-    ...
-    /* call _ReduceMax function */
-    _ReduceMax(&input, &output, dim);
-    ...
-}
-
-/*
-get the max value of the items along a dimension of the tensor
-
->> input - the input tensor
->> output - the output tensor
->> dim - the dimension where the reduction is performed on
-*/
-void ReduceMax(const XTensor &input, XTensor &output, int dim)
-{
-    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
-
-    if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
-        ...
-        InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
-        ...
-    }
-
-    /* call _ReduceMax function */
-    _ReduceMax(&input, &output, dim);
-
-    if (input.enableGrad) {
-        /* tensor connections */
-        XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
-        XLink::AddParamToHeadInt(&output, dim);
-    }
-}
+#define REDUCE_FUNCTION(funcName, funcOp) \
+XTensor funcName(const XTensor & input, int dim) \
+{ \
+    CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!"); \
+ \
+    int order = input.order - 1; \
+    int * dimSize = new int[order]; \
+    for(int i = 0; i < order; i++){ \
+        if(i < dim) \
+            dimSize[i] = input.dimSize[i]; \
+        else if(i >= dim) \
+            dimSize[i] = input.dimSize[i + 1]; \
+    } \
+ \
+    float dr = (!input.isSparse) ? 1.0F : input.denseRatio; \
+    XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem); \
+    output.SetTMPFlag(); \
+ \
+    /* call _ReduceMax function */ \
+    funcOp(&input, &output, dim); \
+ \
+    /* tensor connection */ \
+    XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); \
+    XLink::AddParamToHeadInt(&output, dim); \
+ \
+    /* destroy variables */ \
+    delete[] dimSize; \
+ \
+    return output; \
+}
+
+REDUCE_FUNCTION(ReduceMax, _ReduceMax)
+REDUCE_FUNCTION(ReduceMin, _ReduceMin)

 } // namespace nts(NiuTrans.Tensor)
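The scalar fallback path of the CPU reduce macro above walks a blockNum x strideNum x stride decomposition of the input; a plain C++ reference of that path, using std::max in place of the _reduceOp parameter and container types chosen only for illustration, is sketched here.

    #include <algorithm>
    #include <vector>

    // Reduce a row-major tensor over dimension `dim` by taking the maximum.
    // Output shape equals the input shape with dims[dim] removed.
    std::vector<float> ReduceMaxRef(const std::vector<float>& in,
                                    const std::vector<int>& dims, int dim)
    {
        int stride = 1, blockNum = 1, strideNum = dims[dim];
        for (int i = dim + 1; i < (int)dims.size(); i++) stride   *= dims[i];
        for (int i = 0; i < dim; i++)                    blockNum *= dims[i];

        std::vector<float> out((size_t)blockNum * stride);
        for (int b = 0; b < blockNum; b++)
            for (int k = 0; k < stride; k++) {
                const float* ip = in.data() + (size_t)b * strideNum * stride;
                float m = ip[k];
                for (int j = 1; j < strideNum; j++)
                    m = std::max(m, ip[j * stride + k]);
                out[(size_t)b * stride + k] = m;
            }
        return out;
    }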
source/tensor/core/reduce/ReduceMax.cu
查看文件 @
f5149a15
...
@@ -33,67 +33,75 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -33,67 +33,75 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
/*
use PTX code to reduce float data
use PTX code to reduce float data
*/
*/
__device__ __forceinline__
#define SHLFUNCFLOAT(funcName, reducePTXOp) \
float shflDownReduceMax(float input)
__device__ __forceinline__ \
{
float funcName(float input) \
float output;
{ \
asm volatile(
float output; \
"{"
asm volatile( \
".reg .f32 r0;"
"{" \
".reg .pred p;"
".reg .f32 r0;" \
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
".reg .pred p;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"@p mov.f32 %1,r0;" \
"setp.lt.f32 p, %1, r0; "
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;" \
"@p mov.f32 %1,r0;"
"setp."#reducePTXOp".f32 p, %1, r0; " \
"mov.f32 %0,%1;"
"@p mov.f32 %1,r0;" \
"}"
"mov.f32 %0,%1;" \
: "=f"(output) : "f"(input));
"}" \
return output;
: "=f"(output) : "f"(input)); \
return output; \
}
}
SHLFUNCFLOAT(shflDownReduceMax, lt)
SHLFUNCFLOAT(shflDownReduceMin, gt)
/*
/*
use PTX code to reduce int data
use PTX code to reduce int data
*/
*/
__device__ __forceinline__
#define SHLFUNCINT(funcName, reducePTXOp) \
int shflDownReduceMax(int input)
__device__ __forceinline__ \
{
int funcName(int input) \
int output;
{ \
asm volatile(
int output; \
"{"
asm volatile( \
".reg .s32 r0;"
"{" \
".reg .pred p;"
".reg .s32 r0;" \
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;"
".reg .pred p;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x10, 0x1f,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x8, 0xf,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x4, 0x7,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p,%1,r0;"
"shfl.sync.down.b32 r0, %1, 0x2, 0x3,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p,%1,r0;" \
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;"
"@p mov.s32 %1,r0;" \
"setp.lt.s32 p, %1, r0; "
"shfl.sync.down.b32 r0, %1, 0x1, 0x1,0xffffffff;" \
"@p mov.s32 %1,r0;"
"setp."#reducePTXOp".s32 p, %1, r0; " \
"mov.s32 %0,%1;"
"@p mov.s32 %1,r0;" \
"}"
"mov.s32 %0,%1;" \
: "=r"(output) : "r"(input));
"}" \
return output;
: "=r"(output) : "r"(input)); \
return output; \
}
}
SHLFUNCINT(shflDownReduceMax, lt)
SHLFUNCINT(shflDownReduceMin, gt)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
Given a block of data, we go over each dimension i in the stride and we have
...
@@ -108,48 +116,52 @@ crossing of the i-th columne and the j-th row.
...
@@ -108,48 +116,52 @@ crossing of the i-th columne and the j-th row.
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> blockNum - how many blocks
*/
*/
__global__
#define KERNELREDUCEFUN3(funName, opName, initData) \
void KernelReduceMax(DTYPE * input, DTYPE * output,
__global__ \
int stride, int strideNum, int reducedStrideNum,
void funName(DTYPE * input, DTYPE * output, \
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum, \
{
int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE/2]; \
int idx = threadIdx.x * blockDim.y + threadIdx.y;
\
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
int idx = threadIdx.x * blockDim.y + threadIdx.y; \
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; \
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; \
if(i >= stride * blockNum)
\
return;
if(i >= stride * blockNum) \
return; \
__syncthreads();
\
__syncthreads(); \
int k = i / stride;
\
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE value = (i < stride * blockNum && j < strideNum) ?
\
input[blockSize * k + stride * j + iOffset] : FLOAT_MIN;
DTYPE value = (i < stride * blockNum && j < strideNum) ? \
input[blockSize * k + stride * j + iOffset] : initData; \
/* load data into the shared mem */
\
iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
/* load data into the shared mem */ \
iData[threadIdx.x * blockDim.y + threadIdx.y] = value; \
__syncthreads();
\
__syncthreads(); \
/* do reduction in shared mem */
\
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){
/* do reduction in shared mem */ \
if(threadIdx.y < s && iData[idx] < iData[idx + s]){
for (unsigned int s = blockDim.y/2; s > 0; s >>= 1){ \
iData[idx] = iData[idx + s];
if(threadIdx.y < s){ \
}
iData[idx] = opName(iData[idx + s], iData[idx]); \
} \
__syncthreads();
\
}
__syncthreads(); \
} \
/* write result for this block to the output array */
\
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
/* write result for this block to the output array */ \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum) \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y]; \
\
}
}
KERNELREDUCEFUN3(KernelReduceMax, MAX, FLOAT_MIN)
KERNELREDUCEFUN3(KernelReduceMin, MIN, MAX_FLOAT)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - slow version
reduce a tensor to another that keeps the max value along a dimension - slow version
Given a block of data, we go over each dimension i in the stride and we have
Given a block of data, we go over each dimension i in the stride and we have
...
@@ -231,48 +243,52 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
...
@@ -231,48 +243,52 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockSize - size of the block (i.e., stride * strideNum)
>> blockNum - how many blocks
>> blockNum - how many blocks
*/
*/
template <unsigned int goodSize> __global__
#define KERNELREDUCEFUN4(funName, opName, opFuncName, initData) \
void KernelReduceMaxFast(DTYPE * input, DTYPE * output,
template <unsigned int goodSize> __global__ \
int stride, int strideNum, int reducedStrideNum,
void funName(DTYPE * input, DTYPE * output, \
int blockSize, int blockNum)
int stride, int strideNum, int reducedStrideNum, \
{
int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK]; \
unsigned int tid = threadIdx.y;
\
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
unsigned int tid = threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
if(i >= stride * blockNum)
\
return;
if(i >= stride * blockNum) \
return; \
__syncthreads();
\
__syncthreads(); \
/* first level reduction */
\
int k = i / stride;
/* first level reduction */ \
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE * data = iData + threadIdx.x * blockDim.y;
\
DTYPE * inputData = input + k * blockSize;
DTYPE * data = iData + threadIdx.x * blockDim.y; \
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : FLOAT_MIN;
DTYPE * inputData = input + k * blockSize; \
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: FLOAT_MIN;
DTYPE value = j < strideNum ? inputData[j * stride + iOffset] : initData; \
DTYPE value2 = j + blockDim.y < strideNum ? inputData[(j + blockDim.y) * stride + iOffset]: initData; \
value = MAX(value, value2);
\
value = shflDownReduceMax(value);
value = opName(value, value2); \
if ((tid & 0x1f) == 0)
value = opFuncName(value); \
data[tid / 32] = value;
if ((tid & 0x1f) == 0) \
__syncthreads();
data[tid / 32] = value; \
__syncthreads(); \
if (tid < 32) {
\
if (tid < blockDim.y / 32)
if (tid < 32) { \
value = data[tid];
if (tid < blockDim.y / 32) \
else
value = data[tid]; \
value = FLOAT_MIN;
else \
value = shflDownReduceMax(value);
value = initData; \
if (tid == 0 && blockIdx.y < reducedStrideNum)
value = opFuncName(value); \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value;
if (tid == 0 && blockIdx.y < reducedStrideNum) \
}
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = value; \
} \
}
}
KERNELREDUCEFUN4(KernelReduceMaxFast, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN4(KernelReduceMinFast, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
reduce a tensor to another that keeps the max value along a dimension - fast version
reduce a tensor to another that keeps the max value along a dimension - fast version
>> input - the input array (representing a tensor)
>> input - the input array (representing a tensor)
...
@@ -372,14 +388,12 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
...
@@ -372,14 +388,12 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
int stride4 = stride3 + stride;
int stride4 = stride3 + stride;
for(int k = 0; k < blockSize; k += stride4){
for(int k = 0; k < blockSize; k += stride4){
DTYPE m = MAX(MAX(ip[k], ip[k + stride]), MAX(ip[k + stride2], ip[k + stride3]));
DTYPE m = MAX(MAX(ip[k], ip[k + stride]), MAX(ip[k + stride2], ip[k + stride3]));
if(max < m)
max = MAX(max, m);
max = m;
}
}
}
}
else{
else{
for(int k = 0; k < blockSize; k += stride)
for (int k = 0; k < blockSize; k += stride)
if(max < ip[k])
max = MAX(max, ip[k]);
max = ip[k];
}
}
__syncthreads();
__syncthreads();
...
@@ -429,66 +443,75 @@ inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
...
@@ -429,66 +443,75 @@ inline void adjustThreadForUseWarpOptimization(dim3& blocks, dim3& threads)
/*
/*
In some case,we use less block to imporve efficiency
In some case,we use less block to imporve efficiency
*/
*/
__global__
#define KERNELREDUCEFUN2(funName, opName, opFuncName, initData) \
void KernelReduceMaxOpLessBlocks(DTYPE * input, DTYPE * output, int strideNum, int blockNum)
__global__ \
{
void funName(DTYPE * input, DTYPE * output, int strideNum, int blockNum) \
int idx = threadIdx.x % 32;
{ \
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int idx = threadIdx.x % 32; \
int idy = (blockIdx.x * blockDim.x + threadIdx.x) / 32; \
int startIndex = idy * strideNum;
\
DTYPE threadMax = FLOAT_MIN;
int startIndex = idy * strideNum; \
for (int i = idx; i < strideNum; i += 32) {
DTYPE threadMax = initData; \
threadMax = max(input[startIndex + i], threadMax);
for (int i = idx; i < strideNum; i += 32) { \
}
threadMax = opName(input[startIndex + i], threadMax); \
threadMax = shflDownReduceMax(threadMax);
} \
if (idx == 0)
threadMax = opFuncName(threadMax); \
output[idy] = threadMax;
if (idx == 0) \
output[idy] = threadMax; \
}
}
KERNELREDUCEFUN2(KernelReduceMaxOpLessBlocks, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN2(KernelReduceMinOpLessBlocks, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
we use PTX code reduce
we use PTX code reduce
*/
*/
__global__
#define KERNELREDUCEFUN1(funName, opName, opFuncName, initData) \
void KernelReduceMaxOp(DTYPE * input, DTYPE * output,int stride, int strideNum,
__global__ \
int reducedStrideNum,int blockSize, int blockNum)
void funName(DTYPE * input, DTYPE * output,int stride, int strideNum, \
{
int reducedStrideNum,int blockSize, int blockNum) \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32];
{ \
__shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK / 32]; \
unsigned int tid = threadIdx.y;
\
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int tid = threadIdx.y; \
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y * blockDim.y + threadIdx.y; \
if (i >= stride * blockNum)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; \
return;
if (i >= stride * blockNum) \
return; \
/* first level reduction */
\
int k = i / stride;
/* first level reduction */ \
int iOffset = i % stride;
int k = i / stride; \
int iOffset = i % stride; \
DTYPE threadMax = FLOAT_MIN;
\
DTYPE threadMax = initData; \
DTYPE * data = iData + threadIdx.x * blockDim.y;
\
DTYPE * inputData = input + k * blockSize;
DTYPE * data = iData + threadIdx.x * blockDim.y; \
for (int it = j; it < strideNum; it += blockDim.y){
DTYPE * inputData = input + k * blockSize; \
threadMax = max(inputData[it * stride + iOffset], threadMax);
for (int it = j; it < strideNum; it += blockDim.y){ \
}
threadMax = opName(inputData[it * stride + iOffset], threadMax); \
} \
__syncthreads();
\
threadMax = shflDownReduceMax(threadMax);
__syncthreads(); \
if ((tid & 0x1f) == 0)
threadMax = opFuncName(threadMax); \
data[tid / 32] = threadMax;
if ((tid & 0x1f) == 0) \
data[tid / 32] = threadMax; \
__syncthreads();
\
/* use one warp to reduce remaining data */
__syncthreads(); \
if (tid < 32){
/* use one warp to reduce remaining data */ \
if (tid < blockDim.y / 32)
if (tid < 32){ \
threadMax = data[tid];
if (tid < blockDim.y / 32) \
else threadMax = FLOAT_MIN;
threadMax = data[tid]; \
threadMax = shflDownReduceMax(threadMax);
else threadMax = initData; \
if (tid == 0 && blockIdx.y < reducedStrideNum)
threadMax = opFuncName(threadMax); \
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax;
if (tid == 0 && blockIdx.y < reducedStrideNum) \
}
output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = threadMax; \
} \
}
}
KERNELREDUCEFUN1(KernelReduceMaxOp, MAX, shflDownReduceMax, FLOAT_MIN)
KERNELREDUCEFUN1(KernelReduceMinOp, MIN, shflDownReduceMin, MAX_FLOAT)
/*
/*
get the max-valued items along a dimension of the tensor (cuda version).
get the max-valued items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
For a 1-dimensional data array a,
...
@@ -497,203 +520,207 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
...
@@ -497,203 +520,207 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
>> output - the output tensor
>> output - the output tensor
>> dim - which dimension to reduce
>> dim - which dimension to reduce
*/
*/
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
#define _CUDAREDUCE(_funcName, _reduceFunc1, _reduceFunc2, _reduceFunc3, _reduceFun4) \
{
void _funcName(const XTensor * input, XTensor * output, int dim) \
CheckNTErrors(input && output, "Empty input or output tensors!");
{ \
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!");
CheckNTErrors(input && output, "Empty input or output tensors!"); \
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!");
CheckNTErrors(input->order == output->order + 1, "Incorrect tensor sizes!"); \
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(input->order > dim && dim >=0, "Illegal dimension to reduce!"); \
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); \
int dimRDI = input->order - dim - 1;
\
for(int i = 0; i < input->order; i++){
for(int i = 0; i < input->order; i++){ \
if(i < dimRDI){
if(i < dim){ \
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!"); \
}
} \
else if(i > dimRDI){
else if(i > dim){ \
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!"); \
}
} \
}
} \
\
int cudaGridSize[3];
int cudaGridSize[3]; \
int cudaBlockSize[3];
int cudaBlockSize[3]; \
int iter = 0;
int iter = 0; \
int stride = 1;
int stride = 1; \
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim]; \
int blockSize = 1;
int blockSize = 1; \
int blockNum = 1;
int blockNum = 1; \
\
for (int i = 0; i < input->order; i++) {
for (int i = 0; i < input->order; i++) { \
if (i < dimRDI)
if (i < dim) \
stride *= input->dimSizeRDI[i];
blockNum *= input->dimSize[i]; \
else if (i > dimRDI)
else if (i > dim) \
blockNum *= input->dimSizeRDI[i];
stride *= input->dimSize[i]; \
}
} \
blockSize = stride * strideNum;
blockSize = stride * strideNum; \
\
int devID = input->devID;
int devID = input->devID; \
XMem * mem = input->mem;
XMem * mem = input->mem; \
\
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
\
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2;
int bufSize = sizeof(DTYPE) * cudaGridSize[0] * stride * blockNum * 2; \
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize); \
DTYPE * buf1 = buf;
DTYPE * buf1 = buf; \
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum; \
\
int devIDBackup;
int devIDBackup; \
ProtectCudaDev(input->devID, devIDBackup);
ProtectCudaDev(input->devID, devIDBackup); \
\
if (stride == 1 && blockNum >= 10) {
if (stride == 1 && blockNum >= 10) { \
dim3 grids;
dim3 grids; \
dim3 blocks;
dim3 blocks; \
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum);
continuousStorageThreadAllocation(grids, blocks, (long long)blockNum, strideNum); \
if (blocks.y >= 128) {
if (blocks.y >= 128) { \
KernelReduceMaxOp <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum);
_reduceFunc1 <<<grids, blocks >>> ((DTYPE *)input->data, (DTYPE*)output->data, stride, strideNum, grids.y, blockSize, blockNum); \
}
} \
else {
else { \
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1;
if (blockNum % 4 != 0) blockNum = (int)(blockNum / 4) + 1; \
else blockNum = blockNum / 4;
else blockNum = blockNum / 4; \
KernelReduceMaxOpLessBlocks <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum);
_reduceFunc2 <<<blockNum, 128 >>> ((DTYPE *)input->data, (DTYPE*)output->data, strideNum, blockNum); \
}
} \
}
} \
else {
else { \
do {
do { \
if (input->dataType == DEFAULT_DTYPE) {
if (input->dataType == DEFAULT_DTYPE) { \
DTYPE * iData = NULL;
DTYPE * iData = NULL; \
DTYPE * oData = NULL;
DTYPE * oData = NULL; \
if (iter == 0) {
if (iter == 0) { \
iData = (DTYPE*)input->data;
iData = (DTYPE*)input->data; \
oData = buf1;
oData = buf1; \
}
} \
else if (iter % 2 == 1) {
else if (iter % 2 == 1) { \
iData = buf1;
iData = buf1; \
oData = buf2;
oData = buf2; \
}
} \
else {
else { \
iData = buf2;
iData = buf2; \
oData = buf1;
oData = buf1; \
}
} \
\
/* unroll the reduction procedure. The code is messy but it is faster. */
/* unroll the reduction procedure. The code is messy but it is faster. */ \
if (strideNum < 32) {
if (strideNum < 32) { \
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFunc3 <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 128) {
else if (strideNum < 128) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 256) {
else if (strideNum < 256) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else if (strideNum < 512) {
else if (strideNum < 512) { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
else {
else { \
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]);
dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
if (cudaGridSize[0] == 1)
if (cudaGridSize[0] == 1) \
oData = (DTYPE*)output->data;
oData = (DTYPE*)output->data; \
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!");
CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
adjustThreadForUseWarpOptimization(blocks, threads);
adjustThreadForUseWarpOptimization(blocks, threads); \
KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum);
_reduceFun4<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
}
} \
}
} \
else if (input->dataType == X_FLOAT16) {
else if (input->dataType == X_FLOAT16) { \
    __half * buf1ft16 = (__half *)buf1; \
    __half * buf2ft16 = (__half *)buf2; \
    __half * iData = NULL; \
    __half * oData = NULL; \
    if (iter == 0) { \
        iData = (__half*)input->data; \
        oData = buf1ft16; \
    } \
    else if (iter % 2 == 1) { \
        iData = buf1ft16; \
        oData = buf2ft16; \
    } \
    else { \
        iData = buf2ft16; \
        oData = buf1ft16; \
    } \
    \
    /* unroll the reduction procedure. The code is messy but it is faster. */ \
    if (strideNum < 32) { \
        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        KernelReduceMax <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 128) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 64), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 64, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<64> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 256) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 128), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 128, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<128> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else if (strideNum < 512) { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 256), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 256, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<256> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    else { \
        GDevs.GetCudaThread2D(devID, MAX(strideNum / 2 + 1, 512), stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize); \
        dim3 blocks(cudaGridSize[1], cudaGridSize[0]), threads(cudaBlockSize[1], cudaBlockSize[0]); \
        if (cudaGridSize[0] == 1) \
            oData = (__half*)output->data; \
        CheckNTErrors(cudaBlockSize[0] >= 512, "Incorrect thread number when calling the cuda kernel!"); \
        KernelReduceMaxFast<512> <<<blocks, threads>>> (iData, oData, stride, strideNum, blocks.y, blockSize, blockNum); \
    } \
    } \
    \
    strideNum = cudaGridSize[0]; \
    blockSize = cudaGridSize[0]; \
    \
    iter++; \
    \
    } while (strideNum > 1); \
    } \
    \
    BacktoCudaDev(input->devID, devIDBackup); \
    \
    if (mem != NULL) \
        mem->ReleaseBuf(mem->devID, bufSize); \
    else \
        XMemFree(input->devID, buf); \
}

_CUDAREDUCE(_CudaReduceMax, KernelReduceMaxOp, KernelReduceMaxOpLessBlocks, KernelReduceMax, KernelReduceMaxFast)
_CUDAREDUCE(_CudaReduceMin, KernelReduceMinOp, KernelReduceMinOpLessBlocks, KernelReduceMin, KernelReduceMinFast)

#endif // USE_CUDA
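The two _CUDAREDUCE instantiations above stamp out a max and a min reduction from one shared body. As a minimal, self-contained sketch of the same macro technique (hypothetical names, plain host-side C++, not the library's kernels):

    #include <algorithm>
    #include <vector>
    #include <cstdio>

    /* The reduction body is written once and instantiated twice; the combining
       operation is a macro parameter. REDUCE_IMPL, ReduceMaxCPU and ReduceMinCPU
       are illustrative names, not part of NiuTrans.Tensor. */
    #define REDUCE_IMPL(funcName, combine)                      \
    void funcName(const float * data, int n, float * result)    \
    {                                                           \
        float r = data[0];                                      \
        for (int i = 1; i < n; i++)                             \
            r = combine(r, data[i]);                            \
        *result = r;                                            \
    }

    REDUCE_IMPL(ReduceMaxCPU, std::max)
    REDUCE_IMPL(ReduceMinCPU, std::min)

    int main()
    {
        std::vector<float> v = {3.0f, -1.5f, 7.25f, 0.0f};
        float hi = 0, lo = 0;
        ReduceMaxCPU(v.data(), (int)v.size(), &hi);
        ReduceMinCPU(v.data(), (int)v.size(), &lo);
        printf("max = %g, min = %g\n", hi, lo);   /* max = 7.25, min = -1.5 */
        return 0;
    }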
source/tensor/core/reduce/ReduceMax.cuh

@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max-valued items along a dimension of the tensor (cuda version) */
 void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);

+/* get the min-valued items along a dimension of the tensor (cuda version) */
+void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
+
 #endif // USE_CUDA

 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/reduce/ReduceMax.h

@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* get the max value of the items along a dimension of the tensor. */
 void _ReduceMax(const XTensor * input, XTensor * output, int dim);

+/* get the min value of the items along a dimension of the tensor. */
+void _ReduceMin(const XTensor * input, XTensor * output, int dim);
+
 /*
 get the max value of the items along a dimension of the tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
 */
 XTensor ReduceMax(const XTensor &input, int dim);

-/* get the max value of the items along a dimension of the tensor. */
-void ReduceMax(const XTensor &input, XTensor &output, int dim);
+/*
+get the min value of the items along a dimension of the tensor (return an XTensor structure)
+make a new tensor to keep the result and return it
+*/
+XTensor ReduceMin(const XTensor &input, int dim);

 } // namespace nts(NiuTrans.Tensor)
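A short usage sketch of the paired interfaces declared above. It assumes only what the declarations state (each call builds a new tensor with the reduced result); tensor construction is left out, so this is an illustration, not library documentation.

    #include "ReduceMax.h"
    using namespace nts;

    /* sketch only: "a" is assumed to be an initialized 2-D XTensor (rows x cols) */
    void ReduceExample(const XTensor &a)
    {
        XTensor rowMax = ReduceMax(a, 1);   /* per-row maximum over the column dimension */
        XTensor rowMin = ReduceMin(a, 1);   /* per-row minimum, the interface added here */
    }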
source/tensor/core/reduce/ReduceMean.cpp

@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
 {
     CheckNTErrors((input->order > dim), "Illegal dimension specified!");

-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];

     _ReduceSum(input, output, dim);
     _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
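Throughout this commit the reverse-dimension access dimSizeRDI[order - dim - 1] is replaced by the direct access dimSize[dim]. Assuming dimSizeRDI simply stores the dimension sizes in reverse order, which is what every substitution in these hunks implies, the two accesses are interchangeable; a tiny sketch of that invariant:

    #include <cassert>

    /* sketch: dimSizeRDI is assumed to be dimSize reversed, so the old
       dimSizeRDI[dimRDI] access and the new dimSize[dim] access agree */
    void CheckRdiEquivalence(const int * dimSize, const int * dimSizeRDI, int order, int dim)
    {
        int dimRDI = order - dim - 1;               /* the index the old code computed */
        assert(dimSizeRDI[dimRDI] == dimSize[dim]);
    }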
source/tensor/core/reduce/ReduceSum.cpp

@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
     CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");

-    int dimRDI = input->order - dim - 1;
-    CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
+    CheckNTErrors(dim < input->order, "Wrong dimension!");

     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
         }
     }

@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;

     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;

-    if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
+    if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
         int vecBufLength = 32 / sizeof(DTYPE);
-        if(dimRDI == 0){
+        if(dim == input->order - 1){
             //data is contiguous in dim 0
             for(int i = 0; i < blockNum; i++){
                 // stride = 1

@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
         }
         else{
             //data is separated
             for(int i = 0; i < blockNum; i++){
-                for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
+                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
                     DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                     DTYPE * op = (DTYPE*)output->data + stride * i;
                     DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
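The stride/blockNum/blockSize computation above recurs in most of the files below. A standalone sketch of what it produces after the change to direct indexing (hypothetical helper, not library code): dimensions before dim count the blocks, dimensions after dim give the in-block stride, and blockSize covers one block including the reduced dimension.

    #include <vector>

    struct ReduceLayout { int stride, strideNum, blockNum, blockSize; };

    /* sketch: mirrors the loop pattern used by _ReduceSum and friends */
    ReduceLayout GetReduceLayout(const std::vector<int> & dimSize, int dim)
    {
        ReduceLayout r{1, dimSize[dim], 1, 0};
        for (int i = 0; i < (int)dimSize.size(); i++) {
            if (i < dim)
                r.blockNum *= dimSize[i];   /* dimensions before dim index the blocks */
            else if (i > dim)
                r.stride *= dimSize[i];     /* dimensions after dim are contiguous within a block */
        }
        r.blockSize = r.stride * r.strideNum;
        return r;
    }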
source/tensor/core/reduce/ReduceSum.cu

@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
     CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");

-    int dimRDI = input->order - dim - 1;
     for(int i = 0; i < input->order; i++){
-        if(i < dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
+        if(i < dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
         }
-        else if(i > dimRDI){
-            CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
+        else if(i > dim){
+            CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
         }
     }

@@ -709,31 +708,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
     int cudaBlockSize[3];
     int iter = 0;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;

     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i < dim)
+            blockNum *= input->dimSize[i];
+        else if (i > dim)
+            stride *= input->dimSize[i];
     }
     blockSize = stride * strideNum;

     int devID = input->devID;
-    XMem * mem = input->mem;
-
-    GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
-
-    int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
-    DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
-    DTYPE * buf1 = buf;
-    DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
-    DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
-
-    int devIDBackup;
-    ProtectCudaDev(input->devID, devIDBackup);
+    int devIDBackup;
+    ProtectCudaDev(devID, devIDBackup);
+
+    DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;

     if (stride == 1 && blockNum >= 10) {
         dim3 grids;

@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
                          strideNum, blockNum, sp, power, isExp);
     }
     else {
+        XMem * mem = input->mem;
+
+        GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
+
+        int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
+        DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
+        DTYPE * buf1 = buf;
+        DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
         do {
             if (input->dataType == DEFAULT_DTYPE) {
                 DTYPE * iData = NULL;

@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
             iter++;

         } while (strideNum > 1);
+
+        if (mem != NULL)
+            mem->ReleaseBuf(mem->devID, bufSize);
+        else
+            XMemFree(devID, buf);
     }

-    ProtectCudaDev(input->devID, devIDBackup);
-
-    if (mem != NULL)
-        mem->ReleaseBuf(mem->devID, bufSize);
-    else
-        XMemFree(input->devID, buf);
+    BacktoCudaDev(devID, devIDBackup);
 }

 #endif // USE_CUDA
source/tensor/core/reduce/ReduceVariance.cpp

@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
 */
 void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
 {
-    int dimRDI = input->order - dim - 1;
-    int num = input->dimSizeRDI[dimRDI];
+    int num = input->dimSize[dim];
     _ReduceSum(input, output, dim, mean, 2.0F);
     _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
 }
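For reference, the identity the function implements, variance = 1/n * sum_i (a_i - mean)^2, as a plain scalar sketch (not library code), mirroring the power-2 ReduceSum followed by the 1/num scale:

    /* sketch: scalar version of _ReduceVariance's decomposition */
    float Variance(const float * a, int n)
    {
        float mean = 0.0f;
        for (int i = 0; i < n; i++)
            mean += a[i];
        mean /= n;

        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += (a[i] - mean) * (a[i] - mean);   /* the power = 2.0F reduction */

        return sum / n;                             /* the _ScaleAndShiftMe(output, 1/num, 0) step */
    }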
source/tensor/core/reduce/VectorBuffer.cpp

@@ -20,7 +20,7 @@
 */

 #include "VectorBuffer.h"
+//#include "math.h"
 namespace nts {
 /* data size for each buffer */
 int VectorBuffer::size()

@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
     return *this;
 }

+/* compute the min of two buffers */
+VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
+    for (int i = 0; i != a.size(); i++) {
+        this->values[i] = MIN(a[i], this->values[i]);
+        printf("runhere");
+    }
+    return *this;
+}
+
 }/* end of the nts (NiuTrans.Tensor) namespace */
 \ No newline at end of file
source/tensor/core/reduce/VectorBuffer.h

@@ -20,7 +20,6 @@
 */
 //#include <cstring>
-#include <math.h>
 #include "../../XGlobal.h"

 namespace nts {

@@ -49,5 +48,8 @@ public:
     /* compute the max of two buffers */
     VectorBuffer maxData(const VectorBuffer &a);

+    /* compute the min of two buffers */
+    VectorBuffer minData(const VectorBuffer &a);
 };
 }
 \ No newline at end of file
source/tensor/core/shape/ConcatenateSolely.cpp

@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
     CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");

     int catDimSize = 0;
-    int dimRDI = big->order - dim - 1;

     for (int i = 0; i < smalls->count; i++) {
         XTensor * tensor = (XTensor*)smalls->GetItem(i);
         CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
         for (int j = 0; j < big->order; j++) {
-            if (j != dimRDI) {
-                CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
+            if (j != dim) {
+                CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
             }
             else {
-                catDimSize += tensor->dimSizeRDI[j];
+                catDimSize += tensor->dimSize[j];
             }
         }
     }

-    CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");
+    CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");

     int stride = 1;
-    for (int i = 0; i < dimRDI; i++)
-        stride *= big->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < big->order; i++)
-        blockNum *= big->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        blockNum *= big->dimSize[i];
+    for (int i = dim + 1; i < big->order; i++)
+        stride *= big->dimSize[i];

     int offset = 0;

@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
     if (smalls->count <= MIN_TENSOR_CAT_NUM) {
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
-            int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
-            int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
+            int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
+            int tPitch = stride * big->dimSize[dim] * big->unitSize;
             int mSize = sPitch;
             int n = blockNum;
             XMemCopy2D((char*)big->data + offset, tPitch, big->devID,

@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
         int * blockSizes = new int[smalls->count];
         for (int i = 0; i < smalls->count; i++) {
             XTensor * tensor = (XTensor*)smalls->GetItem(i);
-            blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
+            blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
             sourceArrays->Add((char*)tensor->data);
         }
source/tensor/core/shape/IsSameShaped.cpp

@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
         return false;

     for(int i = 0; i < a->order; i++){
-        if(a->dimSizeRDI[i] != b->dimSizeRDI[i])
+        if(a->dimSize[i] != b->dimSize[i])
             return false;
     }
source/tensor/core/shape/Merge.cpp

@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     if (leadingDim < 0)
         leadingDim = 0;

-    int whereToMergeRDI = s->order - whereToMerge - 1;
-    int leadingDimRDI = s->order - leadingDim - 1;
-    if (leadingDimRDI < 0)
-        leadingDimRDI = s->order - 1;
+    if (leadingDim >= s->order)
+        leadingDim = leadingDim - s->order;

     CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
     CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),

@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
     CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
-    CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!");
+    CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");

     for (int i = 0; i < s->order; i++) {
-        if (i == whereToMergeRDI) {
-            CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
+        if (i == whereToMerge) {
+            CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
                           "Unmatched tensor sizes!");
         }
-        else if (i < leadingDimRDI){
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
+        else if (i < leadingDim){
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
                           "Unmatched tensor sizes!");
         }
-        else if (i > leadingDimRDI) {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
+        else if (i > leadingDim) {
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
                           "Unmatched tensor sizes!");
         }
     }

@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     int blockNum = 1;
     int gridSize = 1;
     int gridNum = 1;
-    int mergedNum = s->dimSizeRDI[leadingDimRDI];
+    int mergedNum = s->dimSize[leadingDim];

     for (int i = 0; i < s->order; i++) {
-        if (i <= leadingDimRDI) {
-            if (i <= whereToMergeRDI)
-                blockSize *= s->dimSizeRDI[i];
+        if (i >= leadingDim) {
+            if (i >= whereToMerge)
+                blockSize *= s->dimSize[i];
             else
-                blockNum *= s->dimSizeRDI[i];
+                blockNum *= s->dimSize[i];
         }
     }

@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
     if (!isOnSameDevice)
         dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);

-    int blockNumInMerge = s->dimSizeRDI[leadingDimRDI];
+    int blockNumInMerge = s->dimSize[leadingDim];
     int splitSizeInGrid = gridSize / blockNumInMerge;
     int realBlockSize = blockSize * t->unitSize;

@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
     int mergedNum = smalls->count;

     XTensor * s0 = smalls->GetItem(0);
-    int whereToMergeRDI = s0->order - whereToMerge - 1;
     for (int i = 0; i < s0->order; i++) {
-        if (i <= whereToMergeRDI)
-            blockSize *= s0->dimSizeRDI[i];
+        if (i >= whereToMerge)
+            blockSize *= s0->dimSize[i];
         else
-            blockNum *= s0->dimSizeRDI[i];
+            blockNum *= s0->dimSize[i];
     }

     CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
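A standalone sketch of what the rewritten loop in _Merge computes (hypothetical helper, not library code): dimensions at or after whereToMerge contribute to blockSize, while dimensions from leadingDim up to whereToMerge contribute to blockNum.

    #include <vector>

    /* sketch: mirrors the i >= leadingDim / i >= whereToMerge branches above */
    void MergeBlocks(const std::vector<int> & dimSize, int whereToMerge, int leadingDim,
                     int & blockSize, int & blockNum)
    {
        blockSize = 1;
        blockNum  = 1;
        for (int i = 0; i < (int)dimSize.size(); i++) {
            if (i >= leadingDim) {
                if (i >= whereToMerge)
                    blockSize *= dimSize[i];   /* the contiguous part of each merged block */
                else
                    blockNum *= dimSize[i];    /* how many such blocks there are */
            }
        }
    }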
source/tensor/core/shape/Merge.h

@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
 /* merge two tensors into a big tensor (return an XTensor structure) */
 XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);

-void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
-
 } // namespace nts(NiuTrans.Tensor)

 #endif // __MERGE_H__
 \ No newline at end of file
source/tensor/core/shape/Split.cpp

@@ -31,7 +31,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
+transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)

 >> s - the source tensor
 >> t - the target tensor (for return)

@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
     CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
     CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
-    CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!");
+    CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");

-    int whereToSplitRDI = s->order - whereToSplit - 1;
     for (int i = 0; i < s->order; i++) {
-        if (i == whereToSplitRDI) {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
+        if (i == whereToSplit) {
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
                           "Unmatched tensor sizes!");
         }
         else {
-            CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
+            CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
                           "Unmatched tensor sizes!");
         }
     }

     /* for the case that we split the last dimension. Actually
-       (N, M) and (3, N/3, M) have the same memory layout */
-    if (s->order - 1 == whereToSplitRDI) {
+       (N, M) and (N, M/3, 3) have the same memory layout */
+    if (0 == whereToSplit) {
         XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
         return;
     }

@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < s->order; i++) {
-        if (i == whereToSplitRDI) {
-            blockSize *= s->dimSizeRDI[i] / splitNum;
+        if (i == whereToSplit) {
+            blockSize *= s->dimSize[i] / splitNum;
             blockNum *= splitNum;
         }
-        else if (i < whereToSplitRDI)
-            blockSize *= s->dimSizeRDI[i];
+        else if (i > whereToSplit)
+            blockSize *= s->dimSize[i];
         else
-            blockNum *= s->dimSizeRDI[i];
+            blockNum *= s->dimSize[i];
     }

     CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");

@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
 }

 /*
-transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
+transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
 make a new tensor to keep the result and return it

 >> s - the source tensor

@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
     CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
     CheckNTErrors((smalls->count > 0), "Wrong input!");

-    int whereToSplitRDI = big->order - whereToSplit - 1;
     bool uniform = true;

     for (int i = 0; i < smalls->count; i++) {

@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < big->order; i++) {
-        if (i == whereToSplitRDI) {
-            blockSize *= big->dimSizeRDI[i] / splitNum;
+        if (i == whereToSplit) {
+            blockSize *= big->dimSize[i] / splitNum;
             blockNum *= splitNum;
         }
-        else if (i < whereToSplitRDI)
-            blockSize *= big->dimSizeRDI[i];
+        else if (i > whereToSplit)
+            blockSize *= big->dimSize[i];
         else
-            blockNum *= big->dimSizeRDI[i];
+            blockNum *= big->dimSize[i];
     }

     CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
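Based only on the size checks in the new _Split above (t->dimSize[0] == splitNum, and s->dimSize[i] matching t->dimSize[i + 1], divided by splitNum at the split dimension), a sketch of the implied shape relation as a hypothetical validator; this is an illustration of the checks, not library code.

    #include <vector>

    /* sketch: returns true when (source, target) shapes satisfy the checks in _Split */
    bool CheckSplitShapes(const std::vector<int> & s, const std::vector<int> & t,
                          int whereToSplit, int splitNum)
    {
        if (t.size() != s.size() + 1 || t[0] != splitNum)
            return false;
        for (int i = 0; i < (int)s.size(); i++) {
            /* the split dimension shrinks by a factor of splitNum; others are kept */
            int expected = (i == whereToSplit) ? t[i + 1] * splitNum : t[i + 1];
            if (s[i] != expected)
                return false;
        }
        return true;
    }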
source/tensor/core/shape/Unsqueeze.cpp

@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
     CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");

-    int dimRDI = b->order - dim - 1;
     for (int i = 0; i < b->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!");
+        if (i < dim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
         }
-        else if (i > dimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!");
+        else if (i > dim) {
+            CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
         }
         else {
-            CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!");
+            CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
         }
     }

@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int blockNumA = 1;
     int blockNumB = 1;

-    for (int i = 0; i < dimRDI; i++)
-        blockSize *= a->dimSizeRDI[i];
+    for (int i = dim; i < a->order; i++)
+        blockSize *= a->dimSize[i];

     realBlockSize = blockSize * a->unitSize;
source/tensor/core/shape/Unsqueeze.cu

@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int blockSize = 1;
     int blockNumA = 1;
     int blockNumB = 1;

-    int dimRDI = b->order - dim - 1;
-    for (int i = 0; i < dimRDI; i++)
-        blockSize *= a->dimSizeRDI[i];
+    for (int i = dim; i < a->order; i++)
+        blockSize *= a->dimSize[i];

     blockNumA = a->unitNum / blockSize;
     blockNumB = b->unitNum / blockSize;

@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
     int devIDBackup = 0;
     ProtectCudaDev(a->devID, devIDBackup);

-    if (dimRDI == 0) {
+    if (dim == b->order - 1) {
         GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);

         if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
source/tensor/core/sort/Sort.cpp

@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
     CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
     CheckNTErrors((index->dataType == X_INT), "Wrong data type!");

-    int dimRDI = a->order - dim - 1;
     /* make the index tensor */
     SetAscendingOrder(*index, dim);

@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
     }
     else {
         int stride = 1;
-        int strideNum = a->dimSizeRDI[dimRDI];
-        for (int i = 0; i < dimRDI; i++)
-            stride *= a->dimSizeRDI[i];
-
         int blockNum = 1;
-        for (int i = dimRDI + 1; i < a->order; i++)
-            blockNum *= a->dimSizeRDI[i];
+        int strideNum = a->dimSize[dim];
+        for (int i = 0; i < dim; i++)
+            blockNum *= a->dimSize[i];
+        for (int i = dim + 1; i < a->order; i++)
+            stride *= a->dimSize[i];
         int blockSize = stride * strideNum;

         _CopyValues(a, b);
source/tensor/core/sort/Sort.cu

@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
     CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = a->order - dim - 1;
-    if (k < 0 || k > b->dimSizeRDI[dimRDI])
-        k = b->dimSizeRDI[dimRDI];
+    if (k < 0 || k > b->dimSize[dim])
+        k = b->dimSize[dim];

     XMem * mem = a->mem;

     int stride = 1;
-    int strideNum = a->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNum = a->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int m = GetNextPower2(strideNum);
     int n = stride * blockNum;
source/tensor/core/sort/TopK.cpp

@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
     CheckNTErrors(index->dataType == X_INT, "Wrong data type!");

-    int dimRDI = a->order - dim - 1;
     for (int i = 0; i < a->order; i++) {
-        if (i == dimRDI) {
-            CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K");
-            CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!");
+        if (i == dim) {
+            CheckNTErrors((b->dimSize[i] == k), "A too large K");
+            CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
         }
         else {
-            CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
-            CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
+            CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
+            CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
         }
     }

@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");

     int stride = 1;
-    int strideNumA = a->dimSizeRDI[dimRDI];
-    int strideNumB = b->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNumA = a->dimSize[dim];
+    int strideNumB = b->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int blockSizeA = stride * strideNumA;
     int blockSizeB = stride * strideNumB;
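A usage sketch consistent with the shape checks in _TopK above: b and index must match a in every dimension except dim, where their size is k. Tensor construction is assumed to have happened elsewhere; this is an illustration, not library documentation.

    #include "TopK.h"
    using namespace nts;

    /* sketch: a has shape (batch, vocab); b and index are pre-built with shape (batch, k) */
    void TopKExample(const XTensor * a, XTensor * b, XTensor * index)
    {
        int dim = 1;                   /* take the top-k along the vocab dimension */
        int k = 5;
        _TopK(a, b, index, dim, k);    /* b receives the values, index the positions */
    }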
source/tensor/core/sort/TopK.cu

@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
     CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
     CheckNTErrors((b->dimSize[dim] == k), "A too large K");

-    int dimRDI = a->order - dim - 1;
     int stride = 1;
-    int strideNumA = a->dimSizeRDI[dimRDI];
-    for (int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNumA = a->dimSize[dim];
+    for (int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int workerNum = blockNum < 16 ? 64 : 32;

     /* adjust the thread number according to the size of k to fit the shared memory size */
source/tensor/core/utilities/SetAscendingOrder.cpp

@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
         return;
     }

-    int dimRDI = tensor.order - dim - 1;
     if(tensor.devID >= 0){
 #ifdef USE_CUDA
         CudaSetAscendingOrder(&tensor, dim);

@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
     }
     else {
         int stride = 1;
-        int strideNum = tensor.dimSizeRDI[dimRDI];
-        for(int i = 0; i < dimRDI; i++)
-            stride *= tensor.dimSizeRDI[i];
-
         int blockNum = 1;
-        for(int i = dimRDI + 1; i < tensor.order; i++)
-            blockNum *= tensor.dimSizeRDI[i];
+        int strideNum = tensor.dimSize[dim];
+        for(int i = 0; i < dim; i++)
+            blockNum *= tensor.dimSize[i];
+        for(int i = dim + 1; i < tensor.order; i++)
+            stride *= tensor.dimSize[i];

         for(int k = 0; k < blockNum; k++){
             for(int j = 0; j < strideNum; j++){
source/tensor/core/utilities/SetAscendingOrder.cu

@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
 {
     CheckNTErrors((a->dataType == X_INT), "TODO!");

-    int dimRDI = a->order - dim - 1;
     int stride = 1;
-    int strideNum = a->dimSizeRDI[dimRDI];
-    for(int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
-
     int blockNum = 1;
-    for(int i = dimRDI + 1; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
+    int strideNum = a->dimSize[dim];
+    for(int i = 0; i < dim; i++)
+        blockNum *= a->dimSize[i];
+    for(int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int gridSize[3];
     int blockSize[3];
source/tensor/function/LogSoftmax.cpp

@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         return;
     }

-    int leadDimRDI = x->order - leadDim - 1;
     if(!x->isSparse && !y->isSparse &&
        x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
     {

@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         XTensor * blockMax = NULL;
         XTensor * blockSum = NULL;

-        int dimensionSize = y->dimSizeRDI[leadDimRDI];
+        int dimensionSize = y->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;

-        for(int i = 0; i < leadDimRDI; i++)
-            stride *= y->dimSizeRDI[i];
+        for(int i = leadDim + 1; i < y->order; i++)
+            stride *= y->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = y->unitNum / blockSize;

@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
         _ReduceSum(x, sum, leadDim, max, 1.0F, true);

         if(x->devID >= 0){
-            if(leadDimRDI == 0){
+            if(leadDim == x->order - 1){
                 blockSize = y->unitNum;
                 blockNum = 1;
                 blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);

@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
                 blockMax->data = mp;
                 blockSum->data = sp;
 #ifdef USE_CUDA
-                if(leadDimRDI == 0)
+                if(leadDim == x->order - 1)
                     _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
                 else
                     _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);

@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     if(leadDim < 0)
         leadDim = y->order - 1;

-    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if(gold->devID >= 0){
         _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);

@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     }
 #endif

-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;

@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             int key = gold->GetKeyInSparse(i);
             DTYPE value = gold->GetInSparse(i);

             int offset = key;

-            if (dedx->dimSizeRDI[0] != gm) {
+            if (dedx->dimSize[dedx->order - 1] != gm) {
                 int mi = key % gm;
                 int ni = key / gm;
-                int key2 = ni * dedx->dimSizeRDI[0] + mi;
+                int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
                 offset = key2;
             }

             if (key >= 0 && key < size)

@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
             int key = gold->GetKeyInSparse(i);
             DTYPE value = gold->GetInSparse(i);

             int offset = key;

-            if (dedx->dimSizeRDI[0] != gm) {
+            if (dedx->dimSize[dedx->order - 1] != gm) {
                 int mi = key % gm;
                 int ni = key / gm;
-                int key2 = ni * dedx->dimSizeRDI[0] + mi;
+                int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
                 offset = key2;
             }

             if (key >= 0 && key < size)

@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     /* for columns with no xs we set dE/ds = 0 */
     if (gold != NULL && gold->isSparse) {
         CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
-        if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) {
+        if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
             int gn = gold->dimSize[0];
             int gm = gold->dimSize[1];
-            int sm = dedx->dimSizeRDI[0];
-            int sn = dedx->dimSizeRDI[1];
+            int sm = dedx->dimSize[dedx->order - 1];
+            int sn = dedx->dimSize[dedx->order - 2];
             int * flags = new int[sm];
             memset(flags, 0, sizeof(int) * sm);
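The CPU and CUDA paths above both follow the standard numerically stable recipe: subtract the per-block maximum, take the log of the summed exponentials, and subtract again. A scalar sketch of that recipe over one row (plain C++, not library code):

    #include <algorithm>
    #include <cmath>

    /* sketch: stable log-softmax of one row, mirroring the ReduceMax / ReduceSum
       pattern used by _LogSoftmax */
    void LogSoftmaxRow(const float * x, float * y, int n)
    {
        float mx = x[0];
        for (int i = 1; i < n; i++)
            mx = std::max(mx, x[i]);          /* the ReduceMax step */

        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += std::exp(x[i] - mx);       /* the ReduceSum(exp(x - max)) step */

        float logSum = std::log(sum);
        for (int i = 0; i < n; i++)
            y[i] = x[i] - mx - logSum;        /* log softmax */
    }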
source/tensor/function/LogSoftmax.cu

@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
                   "Tensors used in log softmax are not on the same GPU.");
     CheckNTErrors((gold != NULL), "No x gold standard is found!");

-    int leadDimRDI = y->order - leadDim - 1;
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
source/tensor/function/Loss.cpp

@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
     if (output->devID < 0) {
         CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
         CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!");
-        CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!");
+        CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
         CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
         CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");

-        int leadDimRDI = output->order - leadDim - 1;
-        int dimensionSize = output->dimSizeRDI[leadDimRDI];
+        int dimensionSize = output->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;

-        for (int i = 0; i < leadDimRDI; i++)
-            stride *= output->dimSizeRDI[i];
+        for (int i = leadDim + 1; i < output->order; i++)
+            stride *= output->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = output->unitNum / blockSize;

@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
 {
     CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
     CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!");
-    CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!");
+    CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
     CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
     CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");

-    int leadDimRDI = output->order - leadDim - 1;
-    int dimensionSize = output->dimSizeRDI[leadDimRDI];
+    int dimensionSize = output->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= output->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < output->order; i++)
+        stride *= output->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = output->unitNum / blockSize;

@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
     CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
     CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");

-    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
-    if(leadDimRDI < 0){
-        leadDimRDI = y->order - 1;
+    if (leadDim < 0) {
+        leadDim = 0;
         tBeg = 0;
         yBeg = 0;
-        tLen = y->dimSizeRDI[leadDimRDI];
+        tLen = y->dimSize[leadDim];
     }

-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;

-    for (int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for (int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
source/tensor/function/Loss.cu
查看文件 @
f5149a15
...
@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSize
RDI[0] == 1 && y->dimSizeRDI[0
] == 1), "TODO!");
CheckNTErrors((gold->dimSize
[gold->order - 1] == 1 && y->dimSize[y->order - 1
] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...
@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
...
@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2;
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1;
diffNew->dimSize[0] = 1;
diffNew->dimSize
RDI[1
] = 1;
diffNew->dimSize
[diffNew->order - 2
] = 1;
}
}
delete diff;
delete diff;
diff = diffNew;
diff = diffNew;
...
@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
...
@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU.");
"The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!");
CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
if (leadDim < 0) {
if(leadDimRDI < 0){
leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0;
tBeg = 0;
yBeg = 0;
yBeg = 0;
tLen = y->dimSize
RDI[leadDimRDI
];
tLen = y->dimSize
[leadDim
];
}
}
int dimensionSize = y->dimSize
RDI[leadDimRDI
];
int dimensionSize = y->dimSize
[leadDim
];
int stride = 1;
int stride = 1;
int blockSize = 1;
int blockSize = 1;
int blockNum = 1;
int blockNum = 1;
int size = 1;
int size = 1;
for(int i =
0; i < leadDimRDI
; i++)
for(int i =
leadDim + 1; i < y->order
; i++)
stride *= y->dimSize
RDI
[i];
stride *= y->dimSize[i];
size = tLen * stride;
size = tLen * stride;
blockSize = stride * dimensionSize;
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
blockNum = y->unitNum / blockSize;
...
...
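In the _CudaLossCompute hunks above, the first check now reads the last dimension directly (dimSizeRDI[0] is by definition dimSize[order - 1]), and the reshape of diffNew into a 1 x n row vector now writes dimSize[diffNew->order - 2], which for a 2-D tensor is the same slot as the old dimSizeRDI[1], namely dimSize[0]. A minimal sketch of that reshape on a plain shape array (an assumed stand-in for the XTensor fields):

#include <cassert>

int main()
{
    // Hypothetical 1-D shape of length n being promoted to a 1 x n row
    // vector, mirroring the diffNew reshape in _CudaLossCompute.
    int order = 1;
    int dimSize[2] = {8, 0};          // n = 8

    order = 2;
    dimSize[1] = dimSize[0];          // n moves to the column dimension
    dimSize[0] = 1;                   // the row dimension becomes 1

    // Old code: dimSizeRDI[1] = 1;  new code: dimSize[order - 2] = 1;
    // for order == 2 both address dimSize[0].
    dimSize[order - 2] = 1;

    assert(dimSize[0] == 1 && dimSize[1] == 8);
    return 0;
}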
source/tensor/function/Softmax.cpp  View file @ f5149a15
...
@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     if(leadDim < 0)
         leadDim = x->order - 1;
-    int leadDimRDI = x->order - leadDim - 1;
     if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
         int * dimSize = new int[x->order - 1];
         for(int i = 0; i < x->order; i++){
...
@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     else{
         CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
-        int dimensionSize = y->dimSizeRDI[leadDimRDI];
+        int dimensionSize = y->dimSize[leadDim];
         int stride = 1;
         int blockSize = 1;
         int blockNum = 1;
-        for(int i = 0; i < leadDimRDI; i++)
-            stride *= y->dimSizeRDI[i];
+        for(int i = leadDim + 1; i < y->order; i++)
+            stride *= y->dimSize[i];
         blockSize = stride * dimensionSize;
         blockNum = y->unitNum / blockSize;
...
@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     if(leadDim < 0)
         leadDim = y->order - 1;
-    int leadDimRDI = y->order - leadDim - 1;
 #ifdef USE_CUDA
     if(y->devID >= 0){
         _CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
...
@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
     }
 #endif
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;
-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
...
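The stride / blockSize / blockNum decomposition computed in _Softmax and _SoftmaxBackward views the tensor as blockNum blocks, each containing dimensionSize slices of length stride along the leading dimension. A minimal CPU sketch (not the NiuTrans.Tensor implementation; it assumes dense row-major float data) of walking that layout to normalize along leadDim:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Apply a softmax along leadDim of a dense row-major tensor, walking the
// same blockNum / dimensionSize / stride layout used in _Softmax.
void softmaxAlongLeadDim(std::vector<float>& data, const std::vector<int>& dims, int leadDim)
{
    int order = (int)dims.size();
    int dimensionSize = dims[leadDim];
    int stride = 1;
    for (int i = leadDim + 1; i < order; i++)
        stride *= dims[i];
    int blockSize = stride * dimensionSize;
    int blockNum = (int)data.size() / blockSize;

    for (int b = 0; b < blockNum; b++) {
        for (int s = 0; s < stride; s++) {
            float* base = data.data() + b * blockSize + s;
            float maxv = base[0];
            for (int k = 1; k < dimensionSize; k++)
                maxv = std::max(maxv, base[k * stride]);
            float sum = 0.0f;
            for (int k = 0; k < dimensionSize; k++) {
                base[k * stride] = std::exp(base[k * stride] - maxv);
                sum += base[k * stride];
            }
            for (int k = 0; k < dimensionSize; k++)
                base[k * stride] /= sum;
        }
    }
}

int main()
{
    std::vector<int> dims = {2, 3};                  // a 2 x 3 tensor
    std::vector<float> data = {1, 2, 3, 4, 5, 6};
    softmaxAlongLeadDim(data, dims, 1);              // softmax over each row
    for (float v : data)
        std::printf("%.4f ", v);
    std::printf("\n");
    return 0;
}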
source/tensor/function/Softmax.cu  View file @ f5149a15
...
@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
     CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
     CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
-    int leadDimRDI = y->order - leadDim - 1;
-    int dimensionSize = y->dimSizeRDI[leadDimRDI];
+    int dimensionSize = y->dimSize[leadDim];
     int stride = 1;
     int blockSize = 1;
     int blockNum = 1;
-    for(int i = 0; i < leadDimRDI; i++)
-        stride *= y->dimSizeRDI[i];
+    for(int i = leadDim + 1; i < y->order; i++)
+        stride *= y->dimSize[i];
     blockSize = stride * dimensionSize;
     blockNum = y->unitNum / blockSize;
...