Commit f5149a15 by liyinqiao

Merge with Yuhao branch (with a little bit of change).

parent f0b49d6d
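
The recurring change across the hunks below is the removal of Reversed Dimension Indexing (RDI): the dimSizeRDI array is dropped from XTensor and every dimSizeRDI lookup becomes a direct dimSize lookup. A minimal sketch of the index mapping (illustration only, not part of the commit):

    /* RDI stored the dimensions of an order-n tensor in reversed order,
       so the two views relate as:
           dimSizeRDI[i] == dimSize[n - 1 - i]
       Hence the old leadingDimRDI = n - leadingDim - 1 satisfies
           dimSizeRDI[leadingDimRDI] == dimSize[leadingDim],
       and old stride loops over "i < leadingDimRDI" (low RDI indices =
       trailing dimensions) become "i > leadingDim" loops below. */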
@@ -30,8 +30,9 @@
 #include "XDevice.h"
 #include "./test/Test.h"
 #include "./core/CHeader.h"
-#include "./loss/CrossEntropy.h"
+#include "./XBLAS.h"
+#include "./core/sort/TopK.h"
+#include "./core/movement/Gather.h"
 //#define CRTDBG_MAP_ALLOC
 //#include <stdlib.h>
 //#include <crtdbg.h>
......
@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
 bool CONST_TRUE = true;

 int verboseLevel = 0;
-bool useBLAS = false;
-
-#ifdef USE_CUDA
-bool useCUDA = true;
-#else
-bool useCUDA = false;
-#endif

 FILE * tmpLog = NULL;
 double myTime = 0;
......
@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
 #define NIUTRANSNNDEBUG

 extern int verboseLevel;
-extern bool useBLAS;
-extern bool useCUDA;

 #define FFLUSH(FILEH) \
 { \
......
@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
     if (freeMem >= MILLION * 512){
         *myBufSize = MILLION * 128;
         if (freeMem >= MILLION * 1024) {
-            *myBufSize = MILLION * 256;
+            *myBufSize = MILLION * 128;
             if (freeMem >= MILLION * 2048)
-                *myBufSize = MILLION * 512;
+                *myBufSize = MILLION * 128;
         }
     }
 }
......
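
The effect of the GetBufferSize change is that the buffer no longer grows with free memory; a worked example of the new tiers (illustration only):

    /* freeMem <  512 * MILLION : this branch leaves *myBufSize untouched
       freeMem >=  512 * MILLION : *myBufSize = 128 * MILLION
       freeMem >= 1024 * MILLION : *myBufSize = 128 * MILLION (was 256)
       freeMem >= 2048 * MILLION : *myBufSize = 128 * MILLION (was 512) */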
...@@ -266,7 +266,6 @@ void XTensor::Init() ...@@ -266,7 +266,6 @@ void XTensor::Init()
devID = -1; devID = -1;
order = -1; order = -1;
memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM); memset(dimSize, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
memset(dimSizeRDI, 0, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = DEFAULT_DTYPE; dataType = DEFAULT_DTYPE;
unitSize = sizeof(float); unitSize = sizeof(float);
unitNum = 0; unitNum = 0;
...@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor) ...@@ -314,7 +313,6 @@ void XTensor::ShallowCopy(const XTensor &tensor)
order = tensor.order; order = tensor.order;
enableGrad = tensor.enableGrad; enableGrad = tensor.enableGrad;
memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM); memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
dataType = tensor.dataType; dataType = tensor.dataType;
unitSize = tensor.unitSize; unitSize = tensor.unitSize;
unitNum = tensor.unitNum; unitNum = tensor.unitNum;
...@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem) ...@@ -533,7 +531,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim) bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{ {
if (a == NULL || b == NULL) if(a == NULL || b == NULL)
return false; return false;
if ((a->order - 1) != b->order) if ((a->order - 1) != b->order)
...@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize) ...@@ -570,7 +568,6 @@ void XTensor::SetDim(int * myDimSize)
{ {
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i]; dimSize[i] = myDimSize[i];
dimSizeRDI[order - i - 1] = myDimSize[i];
} }
} }
...@@ -598,20 +595,17 @@ reshape the tensor ...@@ -598,20 +595,17 @@ reshape the tensor
void XTensor::Reshape(const int myOrder, const int * myDimSize) void XTensor::Reshape(const int myOrder, const int * myDimSize)
{ {
int dims[MAX_TENSOR_DIM_NUM]; int dims[MAX_TENSOR_DIM_NUM];
int dimsRDI[MAX_TENSOR_DIM_NUM];
int num = 1; int num = 1;
for(int i = 0; i < myOrder; i++){ for(int i = 0; i < myOrder; i++){
num *= myDimSize[i]; num *= myDimSize[i];
dims[i] = abs(myDimSize[i]); dims[i] = abs(myDimSize[i]);
dimsRDI[myOrder - i - 1] = dims[i];
} }
CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!"); CheckNTErrors(abs(num) == unitNum, "Wrong size found when we reshape the tensor!");
order = myOrder; order = myOrder;
memcpy(dimSize, dims, sizeof(int) * order); memcpy(dimSize, dims, sizeof(int) * order);
memcpy(dimSizeRDI, dimsRDI, sizeof(int) * order);
} }
/* /*
...@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const ...@@ -997,18 +991,12 @@ void * XTensor::GetCell(int index[], int size) const
{ {
CheckNTErrors((size == order), "Illegal index!"); CheckNTErrors((size == order), "Illegal index!");
int * indexRDI = new int[size]; int offset = index[0];
for (int i = 0; i < size; i++) for(int i = 1; i < size; ++i){
indexRDI[size - i - 1] = index[i]; CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
int offset = indexRDI[size - 1];
for(int i = size - 2; i >= 0; i--){
CheckNTErrors((indexRDI[i] < dimSizeRDI[i]), "Index is out of range!");
offset = offset * dimSizeRDI[i] + indexRDI[i];
} }
delete[] indexRDI;
if(isSparse){ if(isSparse){
DTYPE value; DTYPE value;
void * p; void * p;
...@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize, ...@@ -1469,7 +1457,6 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool zeroData = false; bool zeroData = false;
for(int i = 0; i < order; i++){ for(int i = 0; i < order; i++){
dimSize[i] = abs(myDimSize[i]); dimSize[i] = abs(myDimSize[i]);
dimSizeRDI[order - i - 1] = dimSize[i];
if(myDimSize[i] < 0) if(myDimSize[i] < 0)
filledData = false; filledData = false;
if(myDimSize[i] == 0) if(myDimSize[i] == 0)
...@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1668,7 +1655,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (isSparse) { if (isSparse) {
int num = 0; int num = 0;
for (int i = 0; i < order; i++) for (int i = 0; i < order; i++)
num *= dimSizeRDI[i]; num *= dimSize[i];
num = int(num * denseRatio + 1); num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE); int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num); int size = sizeof(int) + tupleSize*(num);
...@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label) ...@@ -1880,8 +1867,8 @@ void XTensor::Read(FILE * file, const char * label)
int ds[MAX_TENSOR_DIM_NUM]; int ds[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < order; i++) { for (int i = 0; i < order; i++) {
ds[i] = key % dimSizeRDI[i]; ds[i] = key % dimSize[i];
key /= dimSizeRDI[i]; key /= dimSize[i];
} }
Set(value, ds); Set(value, ds);
} }
......
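
The rewritten GetCell offset is plain row-major addressing over dimSize. A worked example (illustration only, not from the commit):

    /* order = 3, dimSize = {2, 3, 4}, index = {1, 2, 3}:
       offset = index[0];                        // 1
       offset = offset * dimSize[1] + index[1];  // 1 * 3 + 2 = 5
       offset = offset * dimSize[2] + index[2];  // 5 * 4 + 3 = 23
       i.e. offset = (i * d1 + j) * d2 + k for cell (i, j, k). */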
@@ -100,9 +100,6 @@ public:
     /* size of each dimension */
     int dimSize[MAX_TENSOR_DIM_NUM];

-    /* size of each dimension by Reversed Dimension Indexing (RDI) Mode */
-    int dimSizeRDI[MAX_TENSOR_DIM_NUM];
-
     /* data unit - data type for every cell */
     TENSOR_DATA_TYPE dataType;
......
@@ -49,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);

-    int leadingDimRDI = a->order - leadingDim - 1;
-
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaDiv(a, b, c, alpha, leadingDim);
@@ -64,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
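
In these element-wise routines the tensor is processed as blockNum blocks of blockSize = stride * dimensionSize elements; with direct indexing, stride is the product of the dimensions after leadingDim. A worked example (illustration only):

    /* a->dimSize = {4, 3, 5}, leadingDim = 1:
       stride         = 5                         (dims after leadingDim)
       dimensionSizeA = 3                         (the leading dim itself)
       blockSizeA     = stride * dimensionSizeA = 15
       blockNum       = a->unitNum / blockSizeA = 60 / 15 = 4 */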
@@ -122,7 +122,6 @@ where i is the item index
 */
 void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
     CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
     CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -77,18 +77,18 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
         return;
     }

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
@@ -96,24 +96,25 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     int bBlockNum = 1;
     int cBlockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
-        aBlockNum *= a->dimSizeRDI[i];
-        cBlockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "Incorrect tensor sizes!");
+        aBlockNum *= a->dimSize[i];
+        cBlockNum *= a->dimSize[i];
     }

-    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
-        bBlockNum *= b->dimSizeRDI[i];
-        cBlockNum *= b->dimSizeRDI[i];
+    for (int i = 0; i < b->order - 2; i++) {
+        CheckNTErrors(b->dimSize[i] == c->dimSize[i - 2 + a->order], "Incorrect tensor sizes!");
+        bBlockNum *= b->dimSize[i];
+        cBlockNum *= b->dimSize[i];
     }

     TensorList * aList = new TensorList(10);
     TensorList * bList = new TensorList(10);
     TensorList * cList = new TensorList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
+    int aDimSize[2] = { -a->dimSize[a->order - 2], a->dimSize[a->order - 1] };
+    int bDimSize[2] = { -b->dimSize[b->order - 2], b->dimSize[b->order - 1] };
+    int cDimSize[2] = { -c->dimSize[c->order - 2], c->dimSize[c->order - 1] };

     bool isSparseMul = false;
@@ -215,20 +216,20 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
         return false;

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a->order + b->order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a->order; i++)
-        dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
-    for (int i = 2; i < b->order; i++)
-        dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
+    for (int i = 0; i < a->order - 2; i++)
+        dimSize[sub++] = a->dimSize[i];
+    for (int i = 0; i < b->order - 2; i++)
+        dimSize[sub++] = b->dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -271,20 +272,20 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -318,20 +319,20 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
     if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {

-        int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-        int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-        int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-        int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+        int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+        int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+        int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+        int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
@@ -370,20 +371,20 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");

-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

     int order = a.order + b.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < a.order; i++)
-        dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-    for (int i = 2; i < b.order; i++)
-        dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+    for (int i = 0; i < a.order - 2; i++)
+        dimSize[sub++] = a.dimSize[i];
+    for (int i = 0; i < b.order - 2; i++)
+        dimSize[sub++] = b.dimSize[i];
     dimSize[sub++] = an;
     dimSize[sub++] = bm;
@@ -416,20 +417,20 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
     if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {

-        int an = a.dimSizeRDI[1];
-        int am = a.dimSizeRDI[0];
-        int bn = b.dimSizeRDI[1];
-        int bm = b.dimSizeRDI[0];
+        int an = a.dimSize[a.order - 2];
+        int am = a.dimSize[a.order - 1];
+        int bn = b.dimSize[b.order - 2];
+        int bm = b.dimSize[b.order - 1];

         CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");

         int order = a.order + b.order - 2;
         int sub = 0;
         int * dimSize = new int[order];
-        for (int i = 2; i < a.order; i++)
-            dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
-        for (int i = 2; i < b.order; i++)
-            dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
+        for (int i = 0; i < a.order - 2; i++)
+            dimSize[sub++] = a.dimSize[i];
+        for (int i = 0; i < b.order - 2; i++)
+            dimSize[sub++] = b.dimSize[i];
         dimSize[sub++] = an;
         dimSize[sub++] = bm;
......
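
After this change the last two dimensions, dimSize[order - 2] (rows) and dimSize[order - 1] (columns), are the matrix dimensions, and all leading dimensions act as batch dimensions placed first in the output shape. A worked shape example (illustration only):

    /* a: (4, 2, 3), b: (6, 3, 5), no transposition:
       an = 2, am = 3, bn = 3, bm = 5       -> am == bn passes
       order   = a.order + b.order - 2 = 4
       dimSize = {4, 6, 2, 5}    // a's batch dims, b's batch dims, an, bm */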
@@ -95,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");
     CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }

     int devIDBackup = 0;
@@ -126,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                                 a->data, transposedA, a->dataType, aBlockSize,
                                 b->data, transposedB, b->dataType, bBlockSize,
                                 c->data, c->dataType, cBlockSize, blockNum,
-                                a->dimSizeRDI[1], a->dimSizeRDI[0],
-                                b->dimSizeRDI[1], b->dimSizeRDI[0],
-                                c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
+                                a->dimSize[a->order - 2], a->dimSize[a->order - 1],
+                                b->dimSize[b->order - 2], b->dimSize[b->order - 1],
+                                c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);

     BacktoCudaDev(a->devID, devIDBackup);
 #endif
@@ -164,32 +164,32 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                   "Input tensor and output tensor must have same order!");

-    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int cn = c->dimSizeRDI[1];
-    int cm = c->dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
+    int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
+    int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
+    int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
+    int cn = c->dimSize[c->order - 2];
+    int cm = c->dimSize[c->order - 1];

     CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");

-    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
-    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
-    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
+    int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
+    int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
+    int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
     int aRealBlockSize = aBlockSize * a->unitSize;
     int bRealBlockSize = bBlockSize * b->unitSize;
     int cRealBlockSize = cBlockSize * c->unitSize;
     int blockNum = 1;

-    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
-        blockNum *= a->dimSizeRDI[i];
+    for (int i = 0; i < a->order - 2; i++) {
+        CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
+        blockNum *= a->dimSize[i];
     }

-    int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
-    int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
-    int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
+    int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
+    int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
+    int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};

     XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
     XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
@@ -292,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

-    int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
-    int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
-    int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
-    int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
+    int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
+    int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
+    int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
+    int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
@@ -350,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
     CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
     CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");

-    int an = a.dimSizeRDI[1];
-    int am = a.dimSizeRDI[0];
-    int bn = b.dimSizeRDI[1];
-    int bm = b.dimSizeRDI[0];
+    int an = a.dimSize[a.order - 2];
+    int am = a.dimSize[a.order - 1];
+    int bn = b.dimSize[b.order - 2];
+    int bm = b.dimSize[b.order - 1];

     CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
@@ -71,20 +71,21 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");

-    int xn = x.dimSizeRDI[1];
-    int xm = x.dimSizeRDI[0];
-    int wn = w.dimSizeRDI[1];
-    int wm = w.dimSizeRDI[0];
+    int xn = x.dimSize[x.order - 2];
+    int xm = x.dimSize[x.order - 1];
+    int wn = w.dimSize[w.order - 2];
+    int wm = w.dimSize[w.order - 1];

     CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");

     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
@@ -148,18 +149,18 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
     CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
     CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");

-    int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
-    int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
-    int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
-    int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
+    int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
+    int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
+    int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
+    int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];

     int order = x.order + w.order - 2;
     int sub = 0;
     int * dimSize = new int[order];
-    for (int i = 2; i < x.order; i++)
-        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
-    for (int i = 2; i < w.order; i++)
-        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    for (int i = 0; i < x.order - 2; i++)
+        dimSize[sub++] = x.dimSize[i];
+    for (int i = 0; i < w.order - 2; i++)
+        dimSize[sub++] = w.dimSize[i];
     dimSize[sub++] = xn;
     dimSize[sub++] = wm;
......
@@ -49,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
                   "Unmatched tensors!");
     CheckDev(a->devID, b->devID);

-    int leadingDimRDI = a->order - leadingDim - 1;
-
 #ifdef USE_CUDA
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
         _CudaMultiply(a, b, c, alpha, leadingDim);
@@ -64,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
     int blockSizeB = 1;
     int blockSizeC = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -122,26 +122,25 @@ where i is the item index
 */
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
-    int leadingDimRDI = a->order - leadingDim - 1;
-    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
+    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
                   "Unmatched tensors in multiplication!");
-    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
+    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");

     int stride = 1;
     int blockSizeA = 1;
     int blockNum = 1;
-    int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
-    int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
+    int dimensionSizeA = a->dimSize[leadingDim];
+    int dimensionSizeB = b->dimSize[leadingDim];
+    int dimensionSizeC = c->dimSize[leadingDim];

     for (int i = 0; i < a->order; i++) {
-        if (i != leadingDimRDI) {
-            CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
-                           a->dimSizeRDI[i] == c->dimSizeRDI[i]),
+        if (i != leadingDim) {
+            CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
+                           a->dimSize[i] == c->dimSize[i]),
                           "Unmatched tensors!");
         }
-        if (i < leadingDimRDI)
-            stride *= a->dimSizeRDI[i];
+        if (i > leadingDim)
+            stride *= a->dimSize[i];
     }

     blockSizeA = stride * dimensionSizeA;
......
@@ -70,20 +70,6 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
         return;
     }

-    /*int dims[MAX_TENSOR_DIM_NUM];
-    for(int i = 0; i < a->order; i++)
-        dims[i] = 1;
-    dims[n] = a->GetDim(n);
-
-    XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
-
-    _CopyValues(b, b2);
-    _SumBroadcast(a, b2, c, beta);
-
-    DelTensor(b2);
-
-    return;*/
-
     if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
 #ifdef USE_CUDA
         _CudaSumDim(a, b, c, n, beta);
......
@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
     int col = colIndex % colNum;
     int block = colIndex / colNum;

-    if (row >= rowNum || block >= blockNum)
+    if(row >= rowNum || block >= blockNum)
         return;

-    if (threadIdx.x == 0)
+    if(threadIdx.x == 0)
         bv[threadIdx.y] = b[row];

     __syncthreads();

     int offset = block * blockSize + row * colNum + col;

-    if (betaFired)
+    if(betaFired)
         c[offset] = a[offset] + bv[threadIdx.y] * beta;
     else
         c[offset] = a[offset] + bv[threadIdx.y];
......
@@ -140,6 +140,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
 }

 /*
+convert index tensor to onehot tensor
+
+>> index - index tensor, which value is an integer num
+>> onehot - onehot tensor, which value is 0 or 1
+>> size - the last dimension size of the onehot tensor
+*/
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
+{
+    /*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
+    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+
+    onehot->SetZeroAll();
+
+#ifdef USE_CUDA
+    if (onehot->devID >= 0) {
+        delete[] cudaIndex;
+        return;
+    }
+#endif
+
+    int blockNum = n;
+    int stride = size;
+
+    int * indexData = (int *)index;
+    int * onehotData = (int *)onehot->data;
+
+    for (int i = 0; i < blockNum; i++) {
+        int id = indexData[i];
+        int * od = onehotData + i * stride;
+        od[id] = 1;
+    }*/
+
+    XTensor* cudaIndex = NewTensor1D(n, X_INT, onehot->devID);
+    cudaIndex->SetData(index, n);
+    _IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
+    delete[] cudaIndex;
+}
+
+/*
 convert onehot tensor to index tensor (return an XTensor structure)
 make a new tensor to keep the result and return it
......
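
The new raw-array overload of _IndexToOnehot wraps the int buffer in a one-dimensional X_INT tensor on the onehot tensor's device and forwards to the tensor-based version. A hedged usage sketch (vocabSize, devID and the label values are made up for illustration):

    int labels[4] = {2, 0, 3, 1};                          // n = 4 gold labels
    XTensor onehot;
    InitTensor2D(&onehot, 4, vocabSize, X_FLOAT, devID);
    _IndexToOnehot(labels, 4, &onehot, vocabSize, 0.1F);   // 0.1 = label smoothing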
@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
 /* convert index tensor to onehot tensor */
 void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);

+/* convert index tensor to onehot tensor */
+void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
+
 /* convert index tensor to onehot tensor (return an XTensor structure)
    make a new tensor to keep the result and return it */
 XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
......
@@ -26,6 +26,82 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, int* index, int dim)
+{
+    CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
+    CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
+    CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
+    CheckNTErrors(a->dataType == c->dataType, "The tensor must be of the same data type!");
+
+    int stride = 1;
+    for (int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];
+    printf("\n%d %d\n", a->order - dim - 1, stride);
+
+    int copyTimes = 1;
+    for (int i = 0; i < dim; i++)
+    {
+        copyTimes *= a->dimSize[i];
+    }
+
+    int cot = c->dimSize[dim];
+    int blockSize = stride * a->unitSize;
+    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
+    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
+    char * s = (char*)a->data;
+    char * t = (char*)c->data;
+
+    for (int i = 0; i < copyTimes; i++) {
+        for (int j = 0; j < cot; ++j) {
+            XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
+        }
+        s += stepSizeS;
+        t += stepSizeT;
+    }
+}
+
+/*
+generate a tensor with selected data in index along the given dimension
+
+c = select(a)
+
+>> a - input tensor
+>> c - result tensor
+>> index - the selected index
+>> dim - the dimension along with which we do the job
+*/
+void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
+{
+    if (index->devID >= 0)
+    {
+        int* indexCPU = new int[index->unitNum];
+        XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
+        _Select(a, c, indexCPU, dim);
+        delete[] indexCPU;
+    }
+    else
+    {
+        _Select(a, c, (int *)index->data, dim);
+    }
+}
+
+/*
+*/
+/*XTensor Select(const XTensor &a, int* index, int dim)
+{
+
+}*/
+
+/*
 generate a tensor with selected data in range[low,high] along the given dimension

 c = select(a)
@@ -58,13 +134,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
     }

     int stride = 1;
-    int dimRDI = a->order - dim - 1;
-    for(int i = 0; i < dimRDI; i++)
-        stride *= a->dimSizeRDI[i];
+    for(int i = dim + 1; i < a->order; i++)
+        stride *= a->dimSize[i];

     int copyTimes = 1;
-    for (int i = dimRDI + 1; i < a->order; i++)
-        copyTimes *= a->dimSizeRDI[i];
+    for (int i = 0; i < dim; i++)
+        copyTimes *= a->dimSize[i];

     int blockSize = stride * (high - low) * a->unitSize;
     int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
@@ -117,12 +192,10 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
     _SelectRange(&a, &c, dim, low, high);

     /* tensor connection */
-    if (a.enableGrad) {
-        XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
-        XLink::AddParamToHeadInt(&c, dim);
-        XLink::AddParamToHeadInt(&c, low);
-        XLink::AddParamToHeadInt(&c, high);
-    }
+    XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
+    XLink::AddParamToHeadInt(&c, dim);
+    XLink::AddParamToHeadInt(&c, low);
+    XLink::AddParamToHeadInt(&c, high);

     /* destroy variables */
     delete[] dimSize;
......
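
A hedged usage sketch of the new _Select (shapes and values made up for illustration; c must be pre-sized so that c->dimSize[dim] equals the number of selected indices):

    int srcDims[2] = {2, 4};
    int tgtDims[2] = {2, 2};
    XTensor a, c;
    InitTensor(&a, 2, srcDims, X_FLOAT);
    InitTensor(&c, 2, tgtDims, X_FLOAT);
    a.SetDataRand();

    int index[2] = {0, 2};
    _Select(&a, &c, index, 1);    // c(i, j) = a(i, index[j])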
@@ -27,7 +27,10 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* generate a tensor with selected data c = select(a) */
-void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
+void _Select(const XTensor * a, XTensor * c, int* index, int dim);
+
+/* generate a tensor with selected data c = select(a) */
+void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);

 /*
 generate a tensor with selected data c = select(a) (returna a XTensor structure)
......
@@ -47,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
                 const XTensor * mean, const XTensor * var,
                 const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
-    int dimRDI = input->order - dim - 1;
     CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
     CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
     CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
-    CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
+    CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");

     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockSize = 1;
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
-            stride *= input->dimSizeRDI[i];
+        if (i < dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
+            blockNum *= input->dimSize[i];
         }
-        else if (i > dimRDI) {
-            CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
-            blockNum *= input->dimSizeRDI[i];
+        else if (i > dim) {
+            CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
+            stride *= input->dimSize[i];
         }
     }
     blockSize = stride * strideNum;
......
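
With direct indexing the accumulation flips relative to RDI: dimensions before dim now count blocks and dimensions after dim form the stride. A worked example (illustration only):

    /* input->dimSize = {8, 4, 6}, dim = 1:
       blockNum  = 8                       (dims before dim)
       strideNum = 4                       (the normalized dim itself)
       stride    = 6                       (dims after dim)
       blockSize = stride * strideNum = 24 */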
@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");

-    int dimRDI = input->order - dim - 1;
     int stride = 1;
-    int strideNum = input->dimSizeRDI[dimRDI];
+    int strideNum = input->dimSize[dim];
     int blockNum = 1;
     for (int i = 0; i < input->order; i++) {
-        if (i < dimRDI)
-            stride *= input->dimSizeRDI[i];
-        else if (i > dimRDI)
-            blockNum *= input->dimSizeRDI[i];
+        if (i > dim)
+            stride *= input->dimSize[i];
+        else if (i < dim)
+            blockNum *= input->dimSize[i];
     }

     int cudaGridSize[3];
......
@@ -41,12 +41,11 @@ void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int
 {
     CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");

-    int blockDimRDI = s->order - blockDim - 1;
     int blockSize = 1;
     int blockNum = blockNumInGrid;
     int gridNum = 1;
-    for (int i = 0; i < blockDimRDI; i++)
-        blockSize *= s->dimSizeRDI[i];
+    for (int i = blockDim; i < s->order; i++)
+        blockSize *= s->dimSize[i];

     CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
     gridNum = s->unitNum / (blockSize * blockNum);
......
@@ -53,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
     CheckNTErrors(dim < s->order && dim < t->order, "A too larget dimension specified!");
     CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");

-    int dimRDI = s->order - dim - 1;
     int blockSizeSrc = 1;
     int blockSizeTgt = 1;
     int blockNumSrc = 1;
     int blockNumTgt = 1;
-    int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
-    int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
+    int leadDimSizeSrc = s->dimSize[dim];
+    int leadDimSizeTgt = t->dimSize[dim];
     int indexOffsetNum = 1;

-    for (int i = 0; i < dimRDI; i++) {
-        blockSizeSrc *= s->dimSizeRDI[i];
-        blockSizeTgt *= t->dimSizeRDI[i];
+    for (int i = dim + 1; i < s->order; i++) {
+        blockSizeSrc *= s->dimSize[i];
+    }
+    for (int i = dim + 1; i < t->order; i++) {
+        blockSizeTgt *= t->dimSize[i];
+    }
+    for (int i = 0; i <= dim; i++)
+    {
+        blockNumSrc *= s->dimSize[i];
+        blockNumTgt *= t->dimSize[i];
     }
-    for (int i = dimRDI; i < s->order; i++)
-        blockNumSrc *= s->dimSizeRDI[i];
-    for (int i = dimRDI; i < t->order; i++)
-        blockNumTgt *= t->dimSizeRDI[i];

     CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
-    indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
+    indexOffsetNum = blockNumSrc / s->dimSize[dim];

     int realIndexSize = indexOffsetNum * indexSize * copyNum;
     int * realSrcIndex = new int[realIndexSize];
@@ -219,14 +221,14 @@ make a new tensor to keep the result and return it

 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-             e.g., for srcIndex = [0,1] and copyNum = 2,
-             we actually copy the source sub-tensors 0, 1, 1 and 2
+             e.g., for srcIndex = [1,4] and copyNum = 2,
+             we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor & s, int dim,
@@ -277,14 +279,14 @@ make a new tensor to keep the result and return it

 >> s - the source tensor
 >> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (4, 2, 3) and dim = 0,
-       we have 4 sub-tensors of size (2, 3)
+       e.g., for a tensor of size (3, 2, 4) and dim = 2,
+       we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and tgtIndex)
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index,
-             e.g., for srcIndex = [0,1] and copyNum = 2,
-             we actually copy the source sub-tensors 0, 1, 1 and 2
+             e.g., for srcIndex = [1,4] and copyNum = 2,
+             we actually copy the source sub-tensors 1, 2, 4, 5
 << return - the result of copying indexed sub-tensors
 */
 XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
......
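
A worked example of the copyNum semantics documented above (illustration only): each source index expands into a run of copyNum consecutive sub-tensors.

    /* srcIndex = {1, 4}, copyNum = 2
       -> sub-tensors copied: 1, 2 (run starting at 1) and 4, 5 (run starting at 4);
       tgtIndex gives the start of each run on the target side. */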
...@@ -33,6 +33,51 @@ gather indexed sub-tensors ...@@ -33,6 +33,51 @@ gather indexed sub-tensors
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
*/
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize)
{
int * tgtIndex = new int[indexSize];
for(int i = 0; i < indexSize; i++)
tgtIndex[i] = i;
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, 1);
delete[] tgtIndex;
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
*/ */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex) void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
...@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -79,10 +124,15 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!"); CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = index.order + 1; int order = s.order;
int * dimSize = new int[order]; int * dimSize = new int[order];
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1); for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio; float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
...@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index) ...@@ -93,11 +143,22 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index); _Gather(&s, &t, &index);
/* tensor connection */ /* tensor connection */
if (s.enableGrad) {
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER); XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t; return t;
}
} }
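The reshape above restores the index shape on the output. A worked example with illustrative shapes, assuming the gathered dimension is 0: for s of shape (10, 8) and index of shape (2, 3), index.unitNum = 6, so _Gather first fills a (6, 8) tensor and the final Reshape hands it back as (2, 3, 8):

    /* illustrative: gathering a (10, 8) table with a (2, 3) index */
    XTensor g = Gather(s, index);   /* g has order 3 and dims (2, 3, 8) */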
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int ...@@ -68,6 +68,36 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/* /*
gather indexed sub-tensors (cuda version) gather indexed sub-tensors (cuda version)
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the flat indices of the source elements
>> stride - number of items in the dimensions after the gathered one
>> strideNum - number of items along the gathered dimension
>> blockNum - number of data blocks (product of the dimensions before it)
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int size = stride * strideNum * blockNum;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
}
}
/*
gather indexed sub-tensors (cuda version)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor >> srcIndex - the tensor to save the index of the source tensor
...@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex) ...@@ -117,6 +147,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
/*
gather indexed sub-tensors (cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
    int devID = s->devID;
    XMem * mem = s->mem;

    int stride = 1;
    int blockNum = 1;
    int indexSize = srcIndex->unitNum;
    int strideNum = srcIndex->dimSize[dim];
    for (int i = 0; i < dim; i++)
        blockNum *= srcIndex->dimSize[i];
    for (int i = dim + 1; i < srcIndex->order; i++)
        stride *= srcIndex->dimSize[i];

    int * sIndex = NULL;
    if (srcIndex->devID < 0) {
        /* the index lives on the host: stage it on the device first */
        sIndex = mem != NULL ?
                 (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                 (int*)XMemAlloc(devID, sizeof(int) * indexSize);
        XMemCopy(sIndex, devID, srcIndex->data, srcIndex->devID, sizeof(int) * indexSize);
    }
    else
        sIndex = (int*)srcIndex->data;

    int cudaGrids[3];
    int cudaBlocks[3];
    GDevs.GetCudaThread2D(devID, max(32, strideNum), stride * blockNum, MAX_INT, cudaGrids, cudaBlocks);

    KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
                 ((DTYPE*)s->data, (DTYPE*)t->data, sIndex, stride, strideNum, blockNum);

    if (srcIndex->devID < 0) {
        /* release the staged index buffer */
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
        else
            XMemFree(devID, sIndex);
    }
}
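The stride/strideNum/blockNum computation above is the one pattern that replaces all dimSizeRDI arithmetic in this commit: for dimension dim of a tensor with dimensions d[0..n-1], blockNum is the product of the dimensions before dim, strideNum is d[dim] itself, and stride is the product of the dimensions after it. A worked example for an illustrative (2, 3, 4) index tensor with dim = 1:

    int blockNum  = 2;   /* d[0], dimensions before dim  */
    int strideNum = 3;   /* d[1], the dimension itself   */
    int stride    = 4;   /* d[2], dimensions after dim   */
    /* blockNum * strideNum * stride = 24 = unitNum      */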
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors (cuda version) */ /* gather indexed sub-tensors (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex); void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,8 +27,14 @@ ...@@ -27,8 +27,14 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */ /* gather selected sub-tensors */
void _Gather(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize);
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex); void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure) /* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index); XTensor Gather(XTensor &s, XTensor &index);
......
...@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
void _ReduceMax(const XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min value of the items along a dimension of the tensor. */
void _ReduceMin(const XTensor * input, XTensor * output, int dim);
/* /*
get the max value of the items along a dimension of the tensor (return an XTensor structure) get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
*/ */
XTensor ReduceMax(const XTensor &input, int dim); XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */ /*
void ReduceMax(const XTensor &input, XTensor &output, int dim); get the min value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMin(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim) ...@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input->order > dim), "Illegal dimension specified!"); CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1; int num = input->dimSize[dim];
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim); _ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0); _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
......
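The hunk above keeps the mean as sum-then-scale: with n = input->dimSize[dim], the computed quantity is

    \mathrm{mean}_d(x) = \frac{1}{n} \sum_{i=0}^{n-1} x_{\ldots,i,\ldots}

so only the way n is looked up changes here, not the reduction itself.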
...@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -54,15 +54,14 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!"); CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1; CheckNTErrors(dim < input->order, "Wrong dimension!");
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dim){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dim){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!"); CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
} }
} }
...@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -75,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1; int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI]; int strideNum = input->dimSize[dim];
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < input->order; i++) { for (int i = 0; i < input->order; i++) {
if (i < dimRDI) if (i < dim)
stride *= input->dimSizeRDI[i]; blockNum *= input->dimSize[i];
else if (i > dimRDI) else if (i > dim)
blockNum *= input->dimSizeRDI[i]; stride *= input->dimSize[i];
} }
blockSize = stride * strideNum; blockSize = stride * strideNum;
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){ if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE); int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){ if(dim == input->order - 1){
//data is contiguous in dim 0 //data is contiguous in dim 0
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
// stride = 1 // stride = 1
...@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor ...@@ -123,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{ } else{
//data is separated //data is separated
for(int i = 0; i < blockNum; i++){ for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){ for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i; DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i; DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......
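As a cross-check on the re-indexed loops above: with blockNum, stride and strideNum computed as in this hunk, a naive row-major reference for the non-vectorized path (ignoring the optional shift and power handling) looks like this sketch, which is not the code in the commit:

    /* naive reduce-sum over `dim`: block k holds blockSize = stride * strideNum
       items, and element (i, j) of a block sits at offset i * stride + j */
    for (int k = 0; k < blockNum; k++) {
        for (int j = 0; j < stride; j++) {
            DTYPE sum = 0;
            DTYPE * ip = (DTYPE*)input->data + k * blockSize + j;
            for (int i = 0; i < strideNum; i++)
                sum += ip[i * stride];
            ((DTYPE*)output->data)[k * stride + j] = sum;
        }
    }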
...@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!"); CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!"); CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dim){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!"); CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dim){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!"); CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
} }
} }
...@@ -709,32 +708,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -709,32 +708,24 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3]; int cudaBlockSize[3];
int iter = 0; int iter = 0;
int stride = 1; int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI]; int strideNum = input->dimSize[dim];
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < input->order; i++) { for (int i = 0; i < input->order; i++) {
if (i < dimRDI) if (i < dim)
stride *= input->dimSizeRDI[i]; blockNum *= input->dimSize[i];
else if (i > dimRDI) else if (i > dim)
blockNum *= input->dimSizeRDI[i]; stride *= input->dimSize[i];
} }
blockSize = stride * strideNum; blockSize = stride * strideNum;
int devID = input->devID; int devID = input->devID;
XMem * mem = input->mem; int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL; DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) { if (stride == 1 && blockNum >= 10) {
dim3 grids; dim3 grids;
dim3 blocks; dim3 blocks;
...@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum,sp, power, isExp); strideNum, blockNum,sp, power, isExp);
} }
else { else {
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do { do {
if (input->dataType == DEFAULT_DTYPE) { if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL; DTYPE * iData = NULL;
...@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen ...@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++; iter++;
} while (strideNum > 1); } while (strideNum > 1);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL) if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize); mem->ReleaseBuf(mem->devID, bufSize);
else else
XMemFree(input->devID, buf); XMemFree(devID, buf);
}
BacktoCudaDev(devID, devIDBackup);
} }
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 ...@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean) void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{ {
int dimRDI = input->order - dim - 1; int num = input->dimSize[dim];
int num = input->dimSizeRDI[dimRDI];
_ReduceSum(input, output, dim, mean, 2.0F); _ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0); _ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
} }
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include "VectorBuffer.h" #include "VectorBuffer.h"
//#include "math.h"
namespace nts { namespace nts {
/* data size for each buffer */ /* data size for each buffer */
int VectorBuffer::size() int VectorBuffer::size()
...@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) { ...@@ -168,4 +168,13 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
return *this; return *this;
} }
/* compute the min of two buffers */
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MIN(a[i], this->values[i]);
}
return *this;
}
}/* end of the nts (NiuTrans.Tensor) namespace */ }/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
*/ */
//#include <cstring> //#include <cstring>
#include <math.h>
#include "../../XGlobal.h" #include "../../XGlobal.h"
namespace nts { namespace nts {
...@@ -49,5 +48,8 @@ public: ...@@ -49,5 +48,8 @@ public:
/* compute the max of two buffers */ /* compute the max of two buffers */
VectorBuffer maxData(const VectorBuffer &a); VectorBuffer maxData(const VectorBuffer &a);
/* compute the min of two buffers */
VectorBuffer minData(const VectorBuffer &a);
}; };
} }
\ No newline at end of file
...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!"); CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0; int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!"); CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) { for (int j = 0; j < big->order; j++) {
if (j != dimRDI) { if (j != dim) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!"); CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
} }
else { else {
catDimSize += tensor->dimSizeRDI[j]; catDimSize += tensor->dimSize[j];
} }
} }
} }
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!"); CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1; int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++) for (int i = 0; i < dim; i++)
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0; int offset = 0;
...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) { if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize; int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch; int mSize = sPitch;
int n = blockNum; int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID, XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim) ...@@ -89,7 +88,7 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
int * blockSizes = new int[smalls->count]; int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i); XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize; blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add((char*)tensor->data); sourceArrays->Add((char*)tensor->data);
} }
......
...@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b) ...@@ -39,7 +39,7 @@ bool _IsSameShaped(const XTensor * a, const XTensor * b)
return false; return false;
for(int i = 0; i < a->order; i++){ for(int i = 0; i < a->order; i++){
if(a->dimSizeRDI[i] != b->dimSizeRDI[i]) if(a->dimSize[i] != b->dimSize[i])
return false; return false;
} }
......
...@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -46,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0) if(leadingDim < 0)
leadingDim = 0; leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1; if (leadingDim >= s->order)
int leadingDimRDI = s->order - leadingDim - 1; leadingDim = leadingDim - s->order;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!"); CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -57,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!"); CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) { if (i == whereToMerge) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i < leadingDimRDI){ else if (i < leadingDim){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i > leadingDimRDI) { else if (i > leadingDim) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]), CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -78,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1; int blockNum = 1;
int gridSize = 1; int gridSize = 1;
int gridNum = 1; int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI]; int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) { if (i >= leadingDim) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
} }
...@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -122,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice) if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size); dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI]; int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge; int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize; int realBlockSize = blockSize * t->unitSize;
...@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge) ...@@ -311,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
int mergedNum = smalls->count; int mergedNum = smalls->count;
XTensor * s0 = smalls->GetItem(0); XTensor * s0 = smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) { for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI) if (i >= whereToMerge)
blockSize *= s0->dimSizeRDI[i]; blockSize *= s0->dimSize[i];
else else
blockNum *= s0->dimSizeRDI[i]; blockNum *= s0->dimSize[i];
} }
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!"); CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......
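With direct indexing, blockNum now covers the dimensions in front of whereToMerge and blockSize the rest. An illustrative case: merging two (2, 4) tensors at whereToMerge = 1 gives blockNum = 2 and blockSize = 4 per input, and a (2, 8) result whose row i is the concatenation of row i of each input:

    /* illustrative: smallA and smallB are (2, 4); t comes out (2, 8) */
    XTensor t = Merge(smallA, smallB, 1);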
...@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge); ...@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */ /* merge two tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge); XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__ #endif // __MERGE_H__
\ No newline at end of file
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor >> s - the source tensor
>> t - the target tensor (for return) >> t - the target tensor (for return)
...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!"); CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else { else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
/* for the case that we split the last dimension. Actually /* for the case that we split the first dimension. Actually
(N, M) and (3, N/3, M) have the same memory layout */ (N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) { if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize); XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return; return;
} }
...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= s->dimSizeRDI[i] / splitNum; blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= s->dimSizeRDI[i]; blockSize *= s->dimSize[i];
else else
blockNum *= s->dimSizeRDI[i]; blockNum *= s->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
...@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int ...@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
} }
/* /*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure) transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
>> s - the source tensor >> s - the source tensor
...@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli ...@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!"); CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!"); CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true; bool uniform = true;
for (int i = 0; i < smalls->count; i++) { for (int i = 0; i < smalls->count; i++) {
...@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli ...@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < big->order; i++) { for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplit) {
blockSize *= big->dimSizeRDI[i] / splitNum; blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum; blockNum *= splitNum;
} }
else if (i < whereToSplitRDI) else if (i > whereToSplit)
blockSize *= big->dimSizeRDI[i]; blockSize *= big->dimSize[i];
else else
blockNum *= big->dimSizeRDI[i]; blockNum *= big->dimSize[i];
} }
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!"); CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
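The same decomposition drives the list version of _Split above. An illustrative case: splitting a (6, 4) tensor with whereToSplit = 0 and splitNum = 3 gives blockSize = (6 / 3) * 4 = 8 and blockNum = 3, i.e. three (2, 4) sub-tensors, each a contiguous block of 8 items.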
...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!"); CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) { for (int i = 0; i < b->order; i++) {
if (i < dimRDI) { if (i < dim) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
} }
else if (i > dimRDI) { else if (i > dim) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
} }
else { else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!"); CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
} }
} }
...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
for (int i = 0; i < dimRDI; i++) for (int i = dim; i < a->order; i++)
blockSize *= a->dimSizeRDI[i]; blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize; realBlockSize = blockSize * a->unitSize;
......
...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1; int blockSize = 1;
int blockNumA = 1; int blockNumA = 1;
int blockNumB = 1; int blockNumB = 1;
int dimRDI = b->order - dim - 1; for (int i = dim; i < a->order; i++)
for (int i = 0; i < dimRDI; i++) blockSize *= a->dimSize[i];
blockSize *= a->dimSizeRDI[i];
blockNumA = a->unitNum / blockSize; blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize; blockNumB = b->unitNum / blockSize;
...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) ...@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) { if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
...@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -47,7 +47,6 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */ /* make the index tensor */
SetAscendingOrder(*index, dim); SetAscendingOrder(*index, dim);
...@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -60,13 +59,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
} }
else { else {
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum; int blockSize = stride * strideNum;
_CopyValues(a, b); _CopyValues(a, b);
......
...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in ...@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!"); CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1; if (k < 0 || k > b->dimSize[dim])
if (k < 0 || k > b->dimSizeRDI[dimRDI]) k = b->dimSize[dim];
k = b->dimSizeRDI[dimRDI];
XMem * mem = a->mem; XMem * mem = a->mem;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum); int m = GetNextPower2(strideNum);
int n = stride * blockNum; int n = stride * blockNum;
......
...@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!"); CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
CheckNTErrors(index->dataType == X_INT, "Wrong data type!"); CheckNTErrors(index->dataType == X_INT, "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) { for (int i = 0; i < a->order; i++) {
if (i == dimRDI) { if (i == dim) {
CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K"); CheckNTErrors((b->dimSize[i] == k), "A too large K");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
} }
else { else {
CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!"); CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!"); CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
} }
} }
...@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA; int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB; int blockSizeB = stride * strideNumB;
......
...@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -812,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K"); CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++) int strideNumA = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32; int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */ /* adjust the thread num according size of k for fitting the share memory size */
......
...@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim) ...@@ -47,7 +47,6 @@ void SetAscendingOrder(XTensor & tensor, int dim)
return; return;
} }
int dimRDI = tensor.order - dim - 1;
if(tensor.devID >= 0){ if(tensor.devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSetAscendingOrder(&tensor, dim); CudaSetAscendingOrder(&tensor, dim);
...@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim) ...@@ -57,13 +56,13 @@ void SetAscendingOrder(XTensor & tensor, int dim)
} }
else{ else{
int stride = 1; int stride = 1;
int strideNum = tensor.dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= tensor.dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for(int i = dimRDI + 1; i < tensor.order; i++) int strideNum = tensor.dimSize[dim];
blockNum *= tensor.dimSizeRDI[i]; for(int i = 0; i < dim; i++)
blockNum *= tensor.dimSize[i];
for(int i = dim + 1; i < tensor.order; i++)
stride *= tensor.dimSize[i];
for(int k = 0; k < blockNum; k++){ for(int k = 0; k < blockNum; k++){
for(int j = 0; j < strideNum; j++){ for(int j = 0; j < strideNum; j++){
......
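A small worked example of the CPU loop above (illustrative shape): for a (2, 3) X_INT tensor with dim = 1, we get blockNum = 2, strideNum = 3 and stride = 1, so SetAscendingOrder fills each row with {0, 1, 2}.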
...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim) ...@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{ {
CheckNTErrors((a->dataType == X_INT), "TODO!"); CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1; int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1; int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++) int strideNum = a->dimSize[dim];
blockNum *= a->dimSizeRDI[i]; for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
......
...@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -50,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return; return;
} }
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse && if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE) x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{ {
...@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -70,13 +69,13 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL; XTensor * blockMax = NULL;
XTensor * blockSum = NULL; XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -87,7 +86,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
_ReduceSum(x, sum, leadDim, max, 1.0F, true); _ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) { if (x->devID >= 0) {
if(leadDimRDI == 0){ if(leadDim == x->order - 1){
blockSize = y->unitNum; blockSize = y->unitNum;
blockNum = 1; blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem); blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
...@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim) ...@@ -138,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp; blockMax->data = mp;
blockSum->data = sp; blockSum->data = sp;
#ifdef USE_CUDA #ifdef USE_CUDA
if(leadDimRDI == 0) if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax); _CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
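For reference, the blockMax/blockSum pipeline above computes the numerically stable form

    \log \mathrm{softmax}(x)_i = x_i - m - \log \sum_j e^{x_j - m}, \qquad m = \max_j x_j

which is why _ReduceMax and the max-shifted, exponentiated _ReduceSum are taken along leadDim before the elementwise step.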
...@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -299,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0) if(leadDim < 0)
leadDim = y->order - 1; leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA #ifdef USE_CUDA
if (gold->devID >= 0) { if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName); _CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
...@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -307,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
} }
#endif #endif
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -339,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -396,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i); int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i); DTYPE value = gold->GetInSparse(i);
int offset = key; int offset = key;
if (dedx->dimSizeRDI[0] != gm) { if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm; int mi = key % gm;
int ni = key / gm; int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi; int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2; offset = key2;
} }
if (key >= 0 && key < size) if (key >= 0 && key < size)
...@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -431,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */ /* for columns with no xs we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) { if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!"); CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) { if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0]; int gn = gold->dimSize[0];
int gm = gold->dimSize[1]; int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0]; int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSizeRDI[1]; int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm]; int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm); memset(flags, 0, sizeof(int)*sm);
......
...@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -385,13 +385,12 @@ void _CudaLogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
"Tensors used in log softmax are not on the same GPU."); "Tensors used in log softmax are not on the same GPU.");
CheckNTErrors((gold != NULL), "No x gold standard is found!"); CheckNTErrors((gold != NULL), "No x gold standard is found!");
int leadDimRDI = y->order - leadDim - 1; int dimensionSize = y->dimSize[leadDim];
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++) for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName, ...@@ -50,18 +50,17 @@ DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
if (output->devID < 0) { if (output->devID < 0) {
CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= output->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!"); CheckNTErrors((_IsSameShaped(gold, output)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE), "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, ...@@ -207,18 +206,17 @@ DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
{ {
CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!"); CheckNTErrors(gLen >= 0 && gLen <= output->unitNum, "Illegal input length!");
CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!"); CheckNTErrors(_IsSameShaped(gold, output), "The input tensors must be of the same size!");
CheckNTErrors(gold->dimSizeRDI[0] == 1 && output->dimSizeRDI[0] == 1, "TODO!"); CheckNTErrors(gold->dimSize[gold->order - 1] == 1 && output->dimSize[output->order - 1] == 1, "TODO!");
CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!"); CheckNTErrors(gold->order > leadDim && leadDim >= 0, "Illegal leading dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = output->order - leadDim - 1; int dimensionSize = output->dimSize[leadDim];
int dimensionSize = output->dimSizeRDI[leadDimRDI];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < output->order; i++)
stride *= output->dimSizeRDI[i]; stride *= output->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = output->unitNum / blockSize; blockNum = output->unitNum / blockSize;
...@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -409,21 +407,20 @@ void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
CheckNTErrors(t->order > leadDim, "Illegal leading dimension!"); CheckNTErrors(t->order > leadDim, "Illegal leading dimension!");
CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!"); CheckNTErrors(t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE, "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if (leadDim < 0) {
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
......
...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -56,7 +56,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
{ {
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!"); CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!"); CheckNTErrors((_IsSameShaped(gold, y)), "The input tensors must be of the same size!");
CheckNTErrors((gold->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1), "TODO!"); CheckNTErrors((gold->dimSize[gold->order - 1] == 1 && y->dimSize[y->order - 1] == 1), "TODO!");
CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!"); CheckNTErrors((gold->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((gold->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!"); CheckNTErrors((gold->devID == y->devID), "Tensors must be on the same device!");
...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -91,7 +91,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -125,7 +125,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName, ...@@ -162,7 +162,7 @@ DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
diffNew->order = 2; diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0]; diffNew->dimSize[1] = diffNew->dimSize[0];
diffNew->dimSize[0] = 1; diffNew->dimSize[0] = 1;
diffNew->dimSizeRDI[1] = 1; diffNew->dimSize[diffNew->order - 2] = 1;
} }
delete diff; delete diff;
diff = diffNew; diff = diffNew;
...@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y, ...@@ -349,22 +349,21 @@ void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
"The vectors must be on the same GPU."); "The vectors must be on the same GPU.");
CheckNTErrors((tBeg == yBeg), "TODO!"); CheckNTErrors((tBeg == yBeg), "TODO!");
int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1; if (leadDim < 0) {
if(leadDimRDI < 0){ leadDim = 0;
leadDimRDI = y->order - 1;
tBeg = 0; tBeg = 0;
yBeg = 0; yBeg = 0;
tLen = y->dimSizeRDI[leadDimRDI]; tLen = y->dimSize[leadDim];
} }
int dimensionSize = y->dimSizeRDI[leadDimRDI]; int dimensionSize = y->dimSize[leadDim];
int stride = 1; int stride = 1;
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
int size = 1; int size = 1;
for(int i = 0; i < leadDimRDI; i++) for(int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSizeRDI[i]; stride *= y->dimSize[i];
size = tLen * stride; size = tLen * stride;
blockSize = stride * dimensionSize; blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize; blockNum = y->unitNum / blockSize;
...
@@ -41,7 +41,6 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
if(leadDim < 0)
leadDim = x->order - 1;
-int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
int * dimSize = new int[x->order - 1];
for(int i = 0; i < x->order; i++){
@@ -71,13 +70,13 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
else{
CheckNTErrors((x->dataType == DEFAULT_DTYPE), "TODO!");
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
@@ -207,8 +206,6 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
-int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if(y->devID >= 0){
_CudaSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
@@ -216,12 +213,12 @@ void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
...
@@ -226,14 +226,13 @@ void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * s
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
CheckNTErrors((_IsSameShaped(x, y)), "Input tensors must be of the same size!");
-int leadDimRDI = y->order - leadDim - 1;
-int dimensionSize = y->dimSizeRDI[leadDimRDI];
+int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
-for(int i = 0; i < leadDimRDI; i++)
-stride *= y->dimSizeRDI[i];
+for(int i = leadDim + 1; i < y->order; i++)
+stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
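Note: a worked example of the block decomposition used here and in the softmax hunks above, for an assumed 2 x 3 x 4 tensor with leadDim = 1:

    #include <cassert>

    // Illustrative only: dimension sizes and leadDim are assumptions,
    // chosen to make the block arithmetic easy to check by hand.
    int main() {
        int dimSize[] = {2, 3, 4};
        int order = 3, leadDim = 1, unitNum = 2 * 3 * 4;

        int dimensionSize = dimSize[leadDim];      // 3
        int stride = 1;
        for (int i = leadDim + 1; i < order; i++)
            stride *= dimSize[i];                  // 4
        int blockSize = stride * dimensionSize;    // 12
        int blockNum = unitNum / blockSize;        // 2

        assert(dimensionSize == 3 && stride == 4);
        assert(blockSize == 12 && blockNum == 2);
        return 0;
    }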
...