Commit 0e585782 by xuchen

Merge with the liyinqiao branch and add the stack function.

parent 93bc3158
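Most of the changes below apply one rename pattern: initializers and constructors whose signature ends in an XMem pointer now carry a V2 suffix, while the plain names take the grad-aware signatures declared in XCall.h (added in this commit, see below). A minimal sketch of the two call styles, assuming the XCall.h declarations are in scope:

/* V2 style: legacy signature ending in an XMem pointer */
XTensor a;
InitTensor2DV2(&a, 2, 3, X_FLOAT, -1 /* devID: CPU */, NULL /* XMem */);

/* plain style: grad-aware signature ending in an isEnableGrad flag */
XTensor b;
InitTensor2D(&b, 2, 3, X_FLOAT, -1 /* devID */, true /* isEnableGrad */);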
......@@ -76,8 +76,8 @@ void BackwardTest()
c.enableGrad = false;
XTensor mean;
XTensor origin;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 2);
InitTensor2DV2(&a, 2, 3);
InitTensor1DV2(&b, 2);
a.SetZeroAll();
b.SetZeroAll();
......@@ -121,9 +121,9 @@ void TransposeTest()
int nnn = GDevs.nGPU;
InitTensor3D(&x, B, N, H, X_FLOAT, 0);
InitTensor4D(&y, K, B, N, H/K, X_FLOAT, 0);
InitTensor3D(&z, B, N, H, X_FLOAT, 0);
InitTensor3DV2(&x, B, N, H, X_FLOAT, 0);
InitTensor4DV2(&y, K, B, N, H/K, X_FLOAT, 0);
InitTensor3DV2(&z, B, N, H, X_FLOAT, 0);
cudaEvent_t ctime0;
cudaEvent_t ctime1;
......@@ -191,9 +191,9 @@ void SumDimTest()
int b = 7;
int c = 3;
InitTensor3D(&x, a, b, c, X_FLOAT, -1);
InitTensor1D(&y, c, X_FLOAT, -1);
InitTensor3D(&z, a, b, c, X_FLOAT, -1);
InitTensor3DV2(&x, a, b, c, X_FLOAT, -1);
InitTensor1DV2(&y, c, X_FLOAT, -1);
InitTensor3DV2(&z, a, b, c, X_FLOAT, -1);
x.SetZeroAll();
y.SetZeroAll();
......
......@@ -281,7 +281,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
smallsGrad.Add(tail->grad);
if(i > 1){
CheckNTErrors(XTensor::IsSameShaped(last, tail),
CheckNTErrors(_IsSameShaped(last, tail),
"Input tensors must be of the same size!");
}
......@@ -391,7 +391,7 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else{
XTensor * inputGradTMP = NewTensorBuf(input, input->devID, input->mem);
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
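The comment above captures the backward rule for SPLIT: the gradient flowing back to the input is the merge of the output gradients, summed into whatever gradient the input already holds from its other uses. A standalone sketch of that accumulation on plain arrays (toy sizes, not the NiuTrans API):

#include <cstdio>

int main()
{
    /* an input of size 4 was split into two halves; each half produced a gradient */
    float gradHalf0[2] = {0.1F, 0.2F};
    float gradHalf1[2] = {0.3F, 0.4F};

    /* the input's gradient may already hold contributions from other uses */
    float inputGrad[4] = {1.0F, 1.0F, 1.0F, 1.0F};

    /* merge the split gradients back, then SUM into the existing gradient */
    float merged[4] = {gradHalf0[0], gradHalf0[1], gradHalf1[0], gradHalf1[1]};
    for (int i = 0; i < 4; i++)
        inputGrad[i] += merged[i];

    for (int i = 0; i < 4; i++)
        printf("%.1f ", inputGrad[i]);   /* 1.1 1.2 1.3 1.4 */
    printf("\n");
    return 0;
}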
......@@ -450,7 +450,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
if(income.typeID == SHAPE_SPLIT_LIST){
int w = income.GetParamInt(0);
int splitID = income.GetParamInt(1);
if(whereToSplit < 0)
whereToSplit = w;
splitNum++;
......@@ -475,7 +475,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else{
XTensor * nodeGradTMP = NewTensorBuf(node, node->devID, node->mem);
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
......@@ -501,7 +501,7 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
XTensor * output = node;
XTensor * input = income.tails[0];
XTensor * b = NewTensorBuf(input, input->devID, input->mem);
XTensor * b = NewTensorBufV2(input, input->devID, input->mem);
XNoder::MakeGrad(input);
int i = income.GetParamInt(0);
......@@ -543,7 +543,7 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
XTensor * g = NewTensorBuf(input->grad, input->devID, input->mem);
XTensor * g = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, g, dim);
_Sum(input->grad, g, input->grad);
......
......@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
if(node == NULL)
return;
if(!XTensor::IsSameShaped(node, node->grad)){
if(!_IsSameShaped(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
node->grad->SetZeroAll();
......
......@@ -20,7 +20,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#include "../tensor/core/CHeader.h"
#ifndef __XNODER_H__
#define __XNODER_H__
......
......@@ -242,13 +242,13 @@ void Check(FNNModel &model)
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
InitTensorV2(&tgt.embeddingW, &src.embeddingW);
InitTensor(&tgt.embeddingW, &src.embeddingW);
for(int i = 0; i < MAX_HIDDEN_NUM; i++){
InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
}
InitTensorV2(&tgt.outputW, &src.outputW);
InitTensorV2(&tgt.outputB, &src.outputB);
InitTensor(&tgt.outputW, &src.outputW);
InitTensor(&tgt.outputB, &src.outputB);
tgt.n = src.n;
tgt.eSize = src.eSize;
......@@ -300,7 +300,7 @@ initialize a 1d tensor using the fnn model setting
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
InitTensor1D(&tensor, num, X_FLOAT, model.devID);
}
/*
......@@ -312,7 +312,7 @@ initialize a 2d tensor using the fnn model setting
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
......@@ -594,14 +594,14 @@ get prediction probabilities of the gold words
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
XTensor probs;
InitTensorV2(&probs, &output);
InitTensor(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
Multiply(output, gold, probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID);
ReduceSum(probs, wprobs, 1);
if(wordProbs != NULL)
CopyValues(wprobs, *wordProbs);
......@@ -615,7 +615,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
InitTensor1D(&result, 1, X_FLOAT, output.devID);
ReduceSum(probs, result, 1);
return result.Get1D(0);
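Since gold is (typically) a one-hot indicator, the elementwise multiply followed by a row-wise ReduceSum simply picks out the score the model assigned to each gold word. A plain-C++ illustration of that selection (toy numbers, not the library API):

#include <cstdio>

int main()
{
    /* one row of model output over a 3-word vocabulary */
    float output[3] = {0.2F, 0.5F, 0.3F};
    /* one-hot gold vector: the correct word is index 1 */
    float gold[3] = {0.0F, 1.0F, 0.0F};

    /* probs[j] = output[j] * gold[j]; the row sum is the gold word's score */
    float wprob = 0.0F;
    for (int j = 0; j < 3; j++)
        wprob += output[j] * gold[j];

    printf("%f\n", wprob);   /* 0.5 */
    return 0;
}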
......@@ -716,7 +716,7 @@ The indexed cell is set to 1, and 0 otherwise.
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID)
{
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll();
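A toy illustration of the zero-one tensor this function builds: each (rows[k], cols[k]) pair flips one cell to 1 in an otherwise all-zero matrix (plain arrays, not the library API):

#include <cstdio>

int main()
{
    const int rowNum = 2, colNum = 4, itemNum = 3;
    int rows[itemNum] = {0, 1, 1};
    int cols[itemNum] = {2, 0, 3};

    float tensor[rowNum][colNum] = {{0}};   /* SetZeroAll */
    for (int k = 0; k < itemNum; k++)
        tensor[rows[k]][cols[k]] = 1.0F;    /* indexed cells set to 1 */

    for (int i = 0; i < rowNum; i++) {
        for (int j = 0; j < colNum; j++)
            printf("%.0f ", tensor[i][j]);
        printf("\n");                       /* prints: 0 0 1 0 / 1 0 0 1 */
    }
    return 0;
}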
......@@ -808,7 +808,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensorV2(&b2D, &s);
InitTensor(&b2D, &s);
Unsqueeze(b, b2D, 0, batchSize);
/* introduce bias term:
......@@ -840,7 +840,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
MatrixMul(h_last, X_NOTRANS, w, X_NOTRANS, s);
XTensor b2D;
InitTensorV2(&b2D, &s);
InitTensor(&b2D, &s);
Unsqueeze(b, b2D, 0, batchSize);
Sum(s, b2D, s);
......@@ -905,8 +905,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor dedsHidden;
XTensor dedxBottom;
if (depth > 0)
InitTensorV2(&dedsHidden, &dedx);
InitTensorV2(&dedxBottom, &net.embeddingCat);
InitTensor(&dedsHidden, &dedx);
InitTensor(&dedxBottom, &net.embeddingCat);
/* back-propagation from top to bottom in the stack of hidden layers
for each layer, h = f(s)
......@@ -944,7 +944,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* back-propagation for the embedding layer */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID);
eList.Add(dedy);
}
......@@ -996,7 +996,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
InitTensor1DV2(&words, size, X_INT, model.devID);
InitTensor1D(&words, size, X_INT, model.devID);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
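Gather(model.embeddingW, words) performs the embedding lookup: each integer in words selects one row of embeddingW. A plain sketch of the same lookup (toy sizes, not the library call):

#include <cstdio>

int main()
{
    /* a 4-word vocabulary with 2-dimensional embeddings */
    float embeddingW[4][2] = {{0.0F, 0.1F}, {1.0F, 1.1F},
                              {2.0F, 2.1F}, {3.0F, 3.1F}};
    int words[3] = {2, 0, 3};   /* word indices of the batch */

    /* gather: row i of the result is embeddingW[words[i]] */
    for (int i = 0; i < 3; i++)
        printf("%.1f %.1f\n", embeddingW[words[i]][0], embeddingW[words[i]][1]);
    return 0;
}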
......@@ -1176,7 +1176,7 @@ void Test(const char * test, const char * result, FNNModel &model)
/* prediction probabilities */
XTensor probs;
InitTensor1DV2(&probs, ngramNum);
InitTensor1D(&probs, ngramNum);
/* get probabilities */
float prob = GetProb(output, gold, &probs);
......
......@@ -69,11 +69,11 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);
InitTensor2D(&wk, d, dk, X_FLOAT, devID);
InitTensor2D(&wq, d, dk, X_FLOAT, devID);
InitTensor2D(&wv, d, dv, X_FLOAT, devID);
InitTensor2D(&wa, d, d, X_FLOAT, devID);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&wk, scale);
......@@ -128,9 +128,9 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID);
split.Add(&q2);
split.Add(&k2);
......
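MakeBig projects k, q and v with the single weight wbig (of width 3 * d, initialized above) and then splits the result into three equal parts along the last dimension, which is why d3 is GetDim(2) / 3. A plain sketch of that three-way split (toy sizes; the q/k/v order follows the Add calls above, and the memory layout here is hypothetical):

#include <cstdio>

int main()
{
    const int d3 = 2;               /* per-part width: GetDim(2) / 3 */
    /* one position of kqv2 whose last dimension holds [q | k | v] */
    float kqv2[3 * d3] = {1, 2, 3, 4, 5, 6};

    float q2[d3], k2[d3], v2[d3];
    for (int j = 0; j < d3; j++) {
        q2[j] = kqv2[0 * d3 + j];
        k2[j] = kqv2[1 * d3 + j];
        v2[j] = kqv2[2 * d3 + j];
    }
    printf("q: %.0f %.0f  k: %.0f %.0f  v: %.0f %.0f\n",
           q2[0], q2[1], k2[0], k2[1], v2[0], v2[1]);
    return 0;
}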
......@@ -365,11 +365,11 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
dims[1] = max;
dims[2] = vSize;
InitTensor2DV2(batchEnc, sc, max, X_INT, devID);
InitTensor2DV2(label, sc, max, X_INT, devID);
InitTensorV2(gold, 3, dims, X_FLOAT, devID);
InitTensor2DV2(paddingEnc, sc, max, X_FLOAT, devID);
InitTensor2DV2(paddingDec, sc, max, X_FLOAT, devID);
InitTensor2D(batchEnc, sc, max, X_INT, devID);
InitTensor2D(label, sc, max, X_INT, devID);
InitTensor(gold, 3, dims, X_FLOAT, devID);
InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID);
InitTensor2D(paddingDec, sc, max, X_FLOAT, devID);
batchEnc->SetZeroAll();
label->SetZeroAll();
......@@ -433,12 +433,12 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
/*XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
/*XTensor * tmp = NewTensorBuf(paddingEnc, devID);
_ConvertDataType(batchEnc, tmp);
_NotEqual(tmp, paddingEnc, 0);
DelTensorBuf(tmp);
XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
_ConvertDataType(batchEnc, tmp2);
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf(tmp2);*/
......@@ -563,12 +563,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
int sCount = sc/2;
int seqSize = 0;
InitTensor2DV2(batchEnc, sCount, maxEnc, X_INT, devID);
InitTensor2DV2(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
InitTensor2DV2(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2DV2(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2DV2(label, sCount, maxDec, X_INT, devID);
//InitTensorV2(gold, 3, dimsDec, X_FLOAT, devID);
InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID);
InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
InitTensor2D(batchDec, sCount, maxDec, X_INT, devID);
InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID);
InitTensor2D(label, sCount, maxDec, X_INT, devID);
//InitTensor(gold, 3, dimsDec, X_FLOAT, devID);
batchEnc->SetZeroAll();
paddingEnc->SetZeroAll();
......@@ -607,7 +607,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
ws = wCountEnc;
batchEnc->SetData(batchEncValues, batchEnc->unitNum);
paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
//XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
//XTensor * tmp = NewTensorBuf(paddingEnc, devID);
//_ConvertDataType(batchEnc, tmp);
//tmp->Dump(stderr, "tmp:");
//_NotEqual(tmp, paddingEnc, 0);
......@@ -656,7 +656,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
label->SetData(labelValues, label->unitNum);
paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
//XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
//XTensor * tmp2 = NewTensorBuf(paddingDec, devID);
//_ConvertDataType(batchDec, tmp2);
//_NotEqual(tmp2, paddingDec, 0);
//DelTensorBuf(tmp2);
......
......@@ -61,7 +61,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
......@@ -78,7 +78,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -142,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, devID);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
......
......@@ -60,11 +60,11 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1D(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1D(&b2, outSize, X_FLOAT, devID);
float scale = 1.0F;
_SetDataFanInOut(&w1, scale);
......
......@@ -53,8 +53,8 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID)
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
......
......@@ -132,7 +132,7 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask;
InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
......@@ -146,14 +146,14 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBufV2(padding.order + 1, dimsPadding, padding.dataType,
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
for(int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
//XTensor * padding3 = NewTensorBufV2(padding.order + 2, dimsPadding, padding.dataType,
//XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
// padding.devID);
//
///* mask of the padding */
......@@ -224,7 +224,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
......@@ -234,11 +234,11 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
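The ScaleAndShift call converts a {0, 1} padding indicator into an additive attention mask: y = x * 1e9 - 1e9 maps real positions (1) to 0 and padded positions (0) to -1e9. A tiny standalone check of that mapping:

#include <cstdio>

int main()
{
    float padding[4] = {1.0F, 1.0F, 0.0F, 0.0F};   /* 1 = real token, 0 = pad */
    for (int i = 0; i < 4; i++) {
        float masked = padding[i] * 1e9F - 1e9F;   /* _ScaleAndShiftMe(x, 1e9F, -1e9F) */
        printf("%g ", masked);                     /* 0 0 -1e+09 -1e+09 */
    }
    printf("\n");
    return 0;
}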
......@@ -254,14 +254,14 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
/* mask of the padding */
......@@ -270,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensorV2(&maskEnc, padding3);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
......@@ -298,14 +298,14 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
/* mask of the padding */
......@@ -314,7 +314,7 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensorV2(&maskEnc, padding3);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
......@@ -344,7 +344,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
/* An upper triangular matrix where the cells of the upper triangle are set to -1e9.
This matrix can be used to block the attention to current or following words in
......@@ -359,11 +359,11 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
......
......@@ -58,7 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
......
......@@ -105,9 +105,9 @@ void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
......@@ -170,7 +170,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
......@@ -194,7 +194,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
......@@ -213,8 +213,8 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1DV2(&selectSrc, 1, X_INT);
InitTensor1DV2(&selectTgt, 1, X_INT);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
......@@ -257,7 +257,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
......
......@@ -192,8 +192,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
for(int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum/outputSize, outputSize);
score.Reshape(score.unitNum/outputSize, outputSize);
......@@ -204,8 +204,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
......@@ -225,7 +225,7 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
_SumDim(&score, &firstMask, &score, 0);
}
InitTensorV2(&mask,
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
......@@ -279,11 +279,11 @@ void T2TSearch::Generate(T2TStateBundle * beam)
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType,
InitTensor(&scoreTopK, order, dimsTopK, score.dataType,
score.devID);
InitTensorV2(&index, order, dimsTopK, X_INT,
InitTensor(&index, order, dimsTopK, X_INT,
score.devID);
InitTensorV2(&preID, order, dimsTopK, X_INT, -1);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
score.Reshape(order, dimsBeam);
......@@ -307,25 +307,25 @@ void T2TSearch::Generate(T2TStateBundle * beam)
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensorV2(&score, &scoreTopK);
InitTensor(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
XTensor indexGPU;
indexGPU = CopyValues(index);
//InitTensor(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
//InitTensorV2(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
//CopyValues(index, indexCPU);
for (int i = 0; i < indexGPU.unitNum; i++)
indexGPU.SetInt(i * stride + indexGPU.GetInt(i), i);
CheckNTErrors(XTensor::IsSameShaped(&prob, &probPath), "Wrong tensor shape!");
CheckNTErrors(IsSameShaped(prob, probPath), "Wrong tensor shape!");
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensorV2(&probPathTopK, &scoreTopK);
InitTensor(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensorV2(&probTopK, &scoreTopK);
InitTensor(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
......@@ -381,7 +381,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
InitTensor(&endMark, &predictionRef);
/* we copy the data to the CPU because frequent access to the GPU is slow,
and we can speed up the process by doing the job on the CPU. */
......@@ -502,7 +502,7 @@ void T2TSearch::Dump(XTensor * output)
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
InitTensorV2(output, 3, dims, X_INT);
InitTensor(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
/* heap for an input sentence in the batch */
......@@ -587,7 +587,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
InitTensor(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
......
......@@ -367,7 +367,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
/* prediction probabilities */
XTensor probs;
InitTensor1DV2(&probs, bSize * length);
InitTensor1D(&probs, bSize * length);
XTensor labelOnehot;
......@@ -452,13 +452,13 @@ get word probabilities for a batch of sequences
float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
{
XTensor probs;
InitTensor(&probs, output);
InitTensorV2(&probs, output);
_Multiply(output, gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1DV2(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
probs.Reshape(2, dims);
......@@ -475,7 +475,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1DV2(&result, 1, X_FLOAT, output->devID);
InitTensor1D(&result, 1, X_FLOAT, output->devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
......@@ -522,7 +522,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
_ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
/* v2 = m / (sqrt(v) + delta) */
XTensor * v2 = NewTensorBufV2(v, v->devID);
XTensor * v2 = NewTensorBuf(v, v->devID);
_Power(v, v2, 0.5F);
_ScaleAndShiftMe(v2, 1.0F, d);
_Div(m, v2, v2);
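The comment above gives the Adam update term, v2 = m / (sqrt(v) + delta), realized here by _Power (square root), _ScaleAndShiftMe (adding the constant, named d in this scope) and _Div. A scalar check of the same arithmetic in plain floats, assuming d is the small delta constant:

#include <cmath>
#include <cstdio>

int main()
{
    float m = 0.02F;     /* first-moment estimate */
    float v = 0.0004F;   /* second-moment estimate */
    float d = 1e-8F;     /* delta, assumed small constant */

    float v2 = sqrtf(v);     /* _Power(v, v2, 0.5F)           */
    v2 = v2 * 1.0F + d;      /* _ScaleAndShiftMe(v2, 1.0F, d) */
    v2 = m / v2;             /* _Div(m, v2, v2)               */

    printf("%f\n", v2);      /* 0.02 / (0.02 + 1e-8), about 1.0 */
    return 0;
}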
......@@ -593,7 +593,7 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
XTensor * padding2 = NewTensorBufV2(1, &padding->unitNum, X_FLOAT, padding->devID);
XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, padding->devID);
_CopyValues(padding, padding2);
_MultiplyDim(output, padding2, output, 0);
......@@ -647,7 +647,7 @@ void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
DTYPE q = 1.0F - p;
DTYPE gift = p / n;
InitTensorV2(smoothed, gold);
InitTensor(smoothed, gold);
_CopyValues(gold, smoothed);
if(p == 0)
......
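Given q = 1 - p and gift = p / n, the elided body presumably applies the standard label-smoothing rule smoothed = gold * q + gift, which moves probability mass p off the gold word and spreads it uniformly over the n-word vocabulary. A plain sketch under that assumption:

#include <cstdio>

int main()
{
    const int n = 4;           /* vocabulary size */
    const float p = 0.1F;      /* smoothing strength */
    const float q = 1.0F - p;
    const float gift = p / n;

    float gold[n] = {0.0F, 1.0F, 0.0F, 0.0F};    /* one-hot gold */
    float smoothed[n];
    for (int i = 0; i < n; i++)
        smoothed[i] = gold[i] * q + gift;        /* assumed smoothing rule */

    for (int i = 0; i < n; i++)
        printf("%.4f ", smoothed[i]);            /* 0.0250 0.9250 0.0250 0.0250 */
    printf("\n");
    return 0;
}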
......@@ -30,8 +30,9 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
#include "./XBLAS.h"
#include "./core/sort/TopK.h"
#include "./core/movement/Gather.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
......@@ -40,9 +41,6 @@ using namespace nts;
void SmallTest();
void TransposeTest();
void LittleTest();
void T2TTest();
void T2TTest2();
void PowerTest();
int main( int argc, const char ** argv )
......@@ -167,127 +165,5 @@ void TransposeTest()
delete[] data;
}
void LittleTest()
{
int a = 5000;
int b = 100000;
int c = a*b;
printf("%d\n", c);
exit(1);
}
void T2TTest()
{
XTensor * input;
XTensor * weight;
XTensor * output;
XTensor * gold;
XTensor * dedy;
XTensor * dedx;
XTensor * dedxTmp;
XTensor * dedw;
XTensor * padding;
DTYPE loss;
int * dimSize = new int[2];
dimSize[0] = 256;
dimSize[1] = 10001;
int * dimSize2 = new int[3];
dimSize2[0] = 2;
dimSize2[1] = 31;
dimSize2[2] = 256;
int * dimSize3 = new int[3];
dimSize3[0] = 2;
dimSize3[1] = 31;
dimSize3[2] = 10001;
int * dimSize4 = new int[2];
dimSize4[0] = 2;
dimSize4[1] = 31;
input = NewTensor(3, dimSize2, X_FLOAT, 1.0F, 0);
weight = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
dedw = NewTensor(2, dimSize, X_FLOAT, 1.0F, 0);
gold = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
output = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedy = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedx = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
dedxTmp = NewTensor(3, dimSize3, X_FLOAT, 1.0F, 0);
padding = NewTensor(2, dimSize4, X_FLOAT, 1.0F, 0);
//weight = NewTensor(2, dimSize);
//dedw = NewTensor(2, dimSize);
//input = NewTensor(3, dimSize2);
//gold = NewTensor(3, dimSize3);
//output = NewTensor(3, dimSize3);
//dedy = NewTensor(3, dimSize3);
//dedx = NewTensor(3, dimSize3);
//dedxTmp = NewTensor(3, dimSize3);
//padding = NewTensor(2, dimSize4);
myRead(input, "x.txt", "x");
myRead(weight, "w.txt", "w");
myRead(gold, "gold.txt", "gold");
myRead(padding, "padding.txt", "padding");
XTensor inter;
inter = MMul(*input, *weight);
_Softmax(&inter, output, 2);
//_LogMe(output);
loss = _CrossEntropyFast(output, gold, REDUCE_MEAN, NULL, padding);
printf("loss: %f\n", loss);
_CrossEntropyBackward(dedy, output, gold, NULL);
//_CrossEntropyBackward(dedy, output, gold, NULL, padding);
myDump(dedy, "dedy.txt", "dedy");
_SoftmaxBackward(NULL, output, input, dedy, dedx, NULL, -1, NOLOSS);
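/* for softmax followed by cross-entropy, the analytic input gradient is output - gold,
   so dedxTmp is computed directly for comparison with dedx */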
_Sub(output, gold, dedxTmp);
myDump(dedx, "dedx.txt", "dedx");
dedx->Dump(stderr, "dedx", 200);
dedxTmp->Dump(stderr, "dedxTmp", 200);
input->Reshape(input->unitNum/input->GetDim(-1), input->GetDim(-1));
dedx->Reshape(dedx->unitNum/dedx->GetDim(-1), dedx->GetDim(-1));
_MatrixMulBatched(input, X_TRANS, dedx, X_NOTRANS, dedw);
myDump(dedw, "dedw.txt", "dedw");
}
void T2TTest2()
{
int dimSize[3];
dimSize[0] = 161;
dimSize[1] = 47;
dimSize[2] = 10001;
XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, 0);
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
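/* every element is 1.0F, so the expected sum is 161 * 47 * 10001 = 75,677,567
   (about 7.5678e+07); any deviation reflects single-precision accumulation error */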
DTYPE sum = _ReduceSumAll(probs);
printf("%e\n", sum);
//XTensor tmp;
//tmp = IsNonZero(*probs);
//DTYPE nonZeroNum = ReduceSumAll(tmp);
//printf("%f\n", nonZeroNum);
//
//DTYPE gpu = ReduceSum(*probs, 1).Get2D(0, 0);
//printf("%e\n", gpu);
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-21
*/
#ifndef __XCALL_H__
#define __XCALL_H__
#include "XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* we define the "new and delete" functions below
*/
/* initialize an XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor */
void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate an XTensor with no initialization */
XTensor * NewTensor();
/* generate an XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate an XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate an XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem);
/* generate an XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = true);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = true);
/* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = true);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
/* free the data space of a given tensor (on the buffer) */
void DelTensorBuf(XTensor * tensor);
} // namespace nts(NiuTrans.Tensor)
#endif // __XCALL_H__
\ No newline at end of file
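A minimal usage sketch for the buffer allocators declared above, assuming the NiuTrans.Tensor build environment (not standalone): NewTensorBufV2 allocates the tensor's data on the buffer of the given device, and DelTensorBuf releases it.

#include "XCall.h"
using namespace nts;

/* scratch-tensor pattern used throughout this commit:
   allocate on the buffer, use as temporary storage, then release */
void ScratchExample(XTensor * input)
{
    XTensor * tmp = NewTensorBufV2(input, input->devID, input->mem);
    /* ... use tmp, e.g. as the target of _Merge before a _Sum ... */
    DelTensorBuf(tmp);
}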
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
......@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
#define NIUTRANSNNDEBUG
extern int verboseLevel;
extern bool useBLAS;
extern bool useCUDA;
#define FFLUSH(FILEH) \
{ \
......
......@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 256;
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 512;
*myBufSize = MILLION * 128;
}
}
}
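After this change every tier at or above 512MB of free memory receives the same 128MB buffer, i.e. the buffer size is effectively capped at 128MB. A standalone replica of the new tiering (assuming MILLION is defined as 1024 * 1024):

#include <cstdio>

typedef unsigned long long MTYPE;
const MTYPE MILLION = 1024 * 1024;   /* assumed definition */

MTYPE GetBufferSizeDemo(MTYPE freeMem)
{
    MTYPE bufSize = 0;
    if (freeMem >= MILLION * 512) {
        bufSize = MILLION * 128;
        if (freeMem >= MILLION * 1024) {
            bufSize = MILLION * 128;         /* was MILLION * 256 */
            if (freeMem >= MILLION * 2048)
                bufSize = MILLION * 128;     /* was MILLION * 512 */
        }
    }
    return bufSize;
}

int main()
{
    printf("%llu MB\n", GetBufferSizeDemo(MILLION * 4096) / MILLION);   /* 128 MB */
    return 0;
}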
......
......@@ -86,11 +86,14 @@
#include "shape/Stack.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
#include "shape/IsSameShaped.h"
#include "sort/Sort.h"
#include "sort/TopK.h"
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "utilities/CheckData.h"
#include "utilities/SetAscendingOrder.h"
#endif // __CHEADER_H__
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
......@@ -48,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
......@@ -63,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] && a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
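With the row-major dimSize indexing that replaces dimSizeRDI here, the stride of the leading dimension is the product of all dimensions after it. A quick standalone check of the stride/block computation (toy shape):

#include <cstdio>

int main()
{
    int dimSize[3] = {4, 7, 3};   /* a 4 x 7 x 3 tensor */
    int leadingDim = 1;

    int stride = 1;
    for (int i = 0; i < 3; i++)
        if (i > leadingDim)
            stride *= dimSize[i];             /* dims after the leading one */

    int dimensionSize = dimSize[leadingDim];
    int blockSize = stride * dimensionSize;   /* elements per block */

    printf("stride = %d, blockSize = %d\n", stride, blockSize);   /* 3, 21 */
    return 0;
}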
......@@ -168,7 +166,7 @@ int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -253,8 +251,8 @@ where i is the index of the item
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetDivDimIndex(a, b);
......
......@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -26,6 +26,7 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -56,7 +57,7 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
if(_IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
......@@ -188,8 +189,8 @@ i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Div function */
......
......@@ -24,6 +24,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Mask.h"
#include "Mask.cuh"
......@@ -171,8 +172,8 @@ where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Mask function */
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
......@@ -94,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int devIDBackup = 0;
......@@ -125,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSizeRDI[1], a->dimSizeRDI[0],
b->dimSizeRDI[1], b->dimSizeRDI[0],
c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
a->dimSize[a->order - 2], a->dimSize[a->order - 1],
b->dimSize[b->order - 2], b->dimSize[b->order - 1],
c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#endif
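The rewrite reads the matrix dimensions directly from the last two entries of dimSize instead of the reversed dimSizeRDI view, and the batch count is the product of all leading dimensions. A standalone check with toy shapes:

#include <cstdio>

int main()
{
    /* a: 8 x 2 x 4 x 5, b: 8 x 2 x 5 x 6 (batched matrices, no transpose) */
    int aDim[4] = {8, 2, 4, 5};
    int bDim[4] = {8, 2, 5, 6};
    int order = 4;

    int an = aDim[order - 2], am = aDim[order - 1];   /* 4 x 5 */
    int bn = bDim[order - 2], bm = bDim[order - 1];   /* 5 x 6 */

    int blockNum = 1;
    for (int i = 0; i < order - 2; i++)
        blockNum *= aDim[i];                          /* 8 * 2 = 16 batches */

    printf("am == bn: %d, c is %d x %d per block, blockNum = %d\n",
           am == bn, an, bm, blockNum);               /* 1, 4 x 6, 16 */
    return 0;
}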
......@@ -163,36 +164,36 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0];
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSizeRDI[i];
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int aDimSize[2] = {-a->dimSizeRDI[1], a->dimSizeRDI[0]};
int bDimSize[2] = {-b->dimSizeRDI[1], b->dimSizeRDI[0]};
int cDimSize[2] = {-c->dimSizeRDI[1], c->dimSizeRDI[0]};
int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};
XTensor * ai = NewTensor2D(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2D(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
XTensor * ci = NewTensor2D(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
XTensor * ai = NewTensor2DV2(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2DV2(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
XTensor * ci = NewTensor2DV2(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
for (int i = 0; i < blockNum; i++) {
ai->data = (char*)a->data + i * aRealBlockSize;
......@@ -242,9 +243,9 @@ void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
if (!_IsSameShaped(aim, ai) ||
!_IsSameShaped(bim, bi) ||
!_IsSameShaped(cim, ci))
{
isUniform = false;
break;
......@@ -291,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......@@ -349,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
......@@ -37,7 +37,7 @@ int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (XTensor::IsSameShaped(&a, &b))
if (IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -71,26 +71,27 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!");
int xn = x.dimSizeRDI[1];
int xm = x.dimSizeRDI[0];
int wn = w.dimSizeRDI[1];
int wm = w.dimSizeRDI[0];
int xn = x.dimSize[x.order - 2];
int xm = x.dimSize[x.order - 1];
int wn = w.dimSize[w.order - 2];
int wm = w.dimSize[w.order - 1];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
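The output shape of MulAndShift concatenates the batch dimensions of x and w and then appends [xn, wm]. A standalone check with toy shapes (no transposition):

#include <cstdio>

int main()
{
    /* x: 16 x 32 x 512 (batch of matrices), w: 512 x 1024 */
    int xDim[3] = {16, 32, 512};
    int wDim[2] = {512, 1024};
    int xOrder = 3, wOrder = 2;

    int xn = xDim[xOrder - 2], xm = xDim[xOrder - 1];
    int wn = wDim[wOrder - 2], wm = wDim[wOrder - 1];

    int order = xOrder + wOrder - 2, sub = 0;
    int dimSize[3];
    for (int i = 0; i < xOrder - 2; i++) dimSize[sub++] = xDim[i];
    for (int i = 0; i < wOrder - 2; i++) dimSize[sub++] = wDim[i];
    dimSize[sub++] = xn;
    dimSize[sub++] = wm;

    printf("xm == wn: %d, output: %d x %d x %d\n",
           xm == wn, dimSize[0], dimSize[1], dimSize[2]);   /* 1, 16 x 32 x 1024 */
    return 0;
}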
......@@ -148,24 +149,24 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!");
int xn = transposedA == X_TRANS ? x.dimSizeRDI[0] : x.dimSizeRDI[1];
int xm = transposedA == X_TRANS ? x.dimSizeRDI[1] : x.dimSizeRDI[0];
int wn = transposedB == X_TRANS ? w.dimSizeRDI[0] : w.dimSizeRDI[1];
int wm = transposedB == X_TRANS ? w.dimSizeRDI[1] : w.dimSizeRDI[0];
int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, transposedA, &w, transposedB, tmp, alpha, 0, parallelRunner);
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
......@@ -48,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
......@@ -63,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
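/* e.g., for tensors of shape (2, 3, 4) and leadingDim = 1:
   stride = 4 (product of the dims after leadingDim),
   dimensionSizeA = 3, so blockSizeA = 12 */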
......@@ -169,7 +167,7 @@ int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -254,8 +252,8 @@ where i is the index of the item
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
......
......@@ -122,26 +122,25 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSizeRDI[leadingDimRDI];
int dimensionSizeB = b->dimSizeRDI[leadingDimRDI];
int dimensionSizeC = c->dimSizeRDI[leadingDimRDI];
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i] &&
a->dimSizeRDI[i] == c->dimSizeRDI[i]),
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i < leadingDimRDI)
stride *= a->dimSizeRDI[i];
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -24,6 +24,7 @@
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
......@@ -57,7 +58,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
if(_IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
......@@ -203,8 +204,8 @@ i.e., a is multiplied with b by broadcasting
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Multiply function */
......@@ -280,8 +281,8 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * s = NewTensorV2(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensorV2(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
......@@ -371,8 +372,8 @@ where some of dimensions of b can be of size 1
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumBroadcast function */
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
......@@ -149,7 +150,7 @@ int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -223,8 +224,8 @@ tensor subtraction c = a - b * \beta
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSubDimIndex(a, b);
......
......@@ -26,6 +26,7 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -61,7 +62,7 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
if (XTensor::IsSameShaped(a, b)) {
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
......@@ -188,8 +189,8 @@ i.e., a is subtracted with b by broadcasting
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
......
......@@ -24,6 +24,7 @@
#include "../../XUtility.h"
#include "../../XBLAS.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "Sum.cuh"
#include "SumDim.h"
......@@ -77,7 +78,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else {
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
......@@ -183,7 +184,7 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(XTensor::IsSameShaped(&a, &b))
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -256,8 +257,8 @@ tensor summation c = a + b * \beta
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSumDimIndex(a, b);
......
......@@ -26,6 +26,7 @@
#include "SumDim.h"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
......@@ -64,25 +65,11 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
if(XTensor::IsSameShaped(a, b)){
if(_IsSameShaped(a, b)){
_Sum(a, b, c, beta);
return;
}
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta);
......@@ -205,8 +192,8 @@ i.e., a is summed with b by broadcasting
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumDim function */
......@@ -281,8 +268,8 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * s = NewTensorV2(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensorV2(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
......@@ -374,8 +361,8 @@ c = a + b * \beta
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumBroadcast function */
......
......@@ -22,6 +22,7 @@
#include "../../XUtility.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../shape/IsSameShaped.h"
#include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -224,9 +225,9 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
if (!_IsSameShaped(aim, ai) ||
!_IsSameShaped(bim, bi) ||
!_IsSameShaped(cim, ci))
{
isUniform = false;
break;
......
......@@ -131,7 +131,7 @@ void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE d
{
if (!output.isInit || input.dataType != output.dataType) {
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
InitTensorV2(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
}
_ConvertDataType(&input, &output);
......
......@@ -85,7 +85,7 @@ XTensor OnehotToIndex(const XTensor & onehot, int size)
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
InitTensorV2(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
......@@ -139,6 +139,47 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
}
/*
convert an index array to a onehot tensor
>> index - the index array, where each entry is an integer label
>> n - length of the index array
>> onehot - onehot tensor, whose values are 0 or 1
>> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the probability used for label smoothing
*/
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
{
/*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
onehot->SetZeroAll();
#ifdef USE_CUDA
if (onehot->devID >= 0) {
delete[] cudaIndex;
return;
}
#endif
int blockNum = n;
int stride = size;
int * indexData = (int *)index;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}*/
XTensor* cudaIndex = NewTensor1DV2(n, X_INT, onehot->devID);
cudaIndex->SetData(index, n);
_IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
delete cudaIndex;
}
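/* a minimal usage sketch (not part of this commit; the helper name
   IndexToOnehotExample and the shapes are hypothetical): on the CPU,
   labels {2, 0} become two one-hot rows of width 4 */
void IndexToOnehotExample()
{
    int index[2] = {2, 0};
    XTensor onehot;
    InitTensor2DV2(&onehot, 2, 4, X_FLOAT, -1);
    onehot.SetZeroAll();
    _IndexToOnehot(index, 2, &onehot, 4, 0.0F);
    /* expected: row 0 = {0, 0, 1, 0}, row 1 = {1, 0, 0, 0} */
}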
/*
convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -159,7 +200,7 @@ XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
InitTensorV2(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size, labelSmoothingP);
......
......@@ -36,6 +36,9 @@ XTensor OnehotToIndex(const XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
......
......@@ -25,6 +25,114 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a tensor with the data selected by index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the indices of the selected slices
>> dim - the dimension along which we do the job
*/
void _Select(const XTensor * a, XTensor * c, int* index, int dim)
{
CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must have the same order!");
CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
CheckNTErrors(a->dataType == c->dataType, "The tensors must have the same data type!");
int stride = 1;
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int copyTimes = 1;
for (int i = 0; i < dim; i++)
{
copyTimes *= a->dimSize[i];
}
int cot = c->dimSize[dim];
int blockSize = stride * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
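/* e.g., for a of shape (2, 5, 3), c of shape (2, 2, 3) and dim = 1:
   stride = 3, copyTimes = 2, blockSize = 3 * unitSize,
   stepSizeS = 15 * unitSize and stepSizeT = 6 * unitSize */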
char * s = (char*)a->data;
char * t = (char*)c->data;
for (int i = 0; i < copyTimes; i++) {
for (int j = 0; j < cot; ++j) {
XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
}
s += stepSizeS;
t += stepSizeT;
}
}
/*
generate a tensor with the data selected by index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the index tensor holding the selected indices
>> dim - the dimension along which we do the job
*/
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
{
if (index->devID >= 0)
{
int* indexCPU = new int[index->unitNum];
XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
_Select(a, c, indexCPU, dim);
delete[] indexCPU;
}
else
{
_Select(a, c, (int *)index->data, dim);
}
}
/*
c = select(a)
>> a - input tensor
>> index - the index tensor holding the selected indices
>> dim - the dimension along which we do the job
<< return - the tensor with the selected data
*/
XTensor Select(const XTensor &a, XTensor &index, int dim)
{
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
for (int i = 0; i < a.order; i++) {
if (i == dim) {
dimSize[i] = index.dimSize[0];
}
else
dimSize[i] = a.dimSize[i];
}
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
/* call _Select function */
_Select(&a, &c, &index, dim);
/* tensor connection */
if (a.enableGrad) {
XLink::MakeLink(&a, &index, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
}
/* destroy variables */
delete[] dimSize;
return c;
}
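/* a minimal usage sketch (not part of this commit; SelectExample and the
   shapes are hypothetical): select slices 0 and 2 along dimension 0 of a
   4 x 3 tensor, giving a 2 x 3 result */
void SelectExample()
{
    XTensor a, index;
    InitTensor2DV2(&a, 4, 3, X_FLOAT, -1);
    InitTensor1DV2(&index, 2, X_INT, -1);
    a.SetDataRand(0.0F, 1.0F);
    int id[2] = {0, 2};
    index.SetData(id, 2);
    XTensor c = Select(a, index, 0);   /* c has shape (2, 3) */
}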
/*
generate a tensor with selected data in range[low,high] along the given dimension
......@@ -58,13 +166,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
}
int stride = 1;
int dimRDI = a->order - dim - 1;
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int copyTimes = 1;
for (int i = dimRDI + 1; i < a->order; i++)
copyTimes *= a->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
copyTimes *= a->dimSize[i];
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
......
......@@ -27,13 +27,16 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
void _Select(const XTensor * a, XTensor * c, int* index, int dim);
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);
/*
generate a tensor with selected data c = select(a) (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
XTensor Select(const XTensor &a, XTensor &index, int dim);
/*
generate a tensor with selected data in range[low,high] along the given dimension
......
......@@ -470,7 +470,7 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
void _SetDataRand(XTensor * tensor, int rNum, int cNum)
{
if (tensor == NULL || tensor->isInit == false || tensor->order !=2 ) {
InitTensor2D(tensor, rNum, cNum);
InitTensor2DV2(tensor, rNum, cNum);
}
_SetDataRand(tensor, 0.0F, 1.0F);
......@@ -519,7 +519,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
#ifdef USE_CUDA
_CudaSetDataRand(tensor, lower, upper);
#endif
//XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//XTensor * t2 = NewTensorV2(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//_SetDataRand(t2, low, high);
//_CopyValues(t2, tensor);
//delete t2;
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -77,7 +78,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -112,7 +113,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
if (a->devID >= 0) { \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -169,8 +170,8 @@ XTensor funcName(const XTensor &a, T num)
_funcName(&a, &b, num); \
if(a.enableGrad){ \
XLink::MakeLink(&a, NULL, &b, operationId); \
XLink::AddParamToHead(&b, num); \
} \
XLink::AddParamToHead(&b, num); \
return b; \
} \
template XTensor funcName<int>(const XTensor&, int); \
......@@ -181,8 +182,8 @@ template XTensor funcName<double>(const XTensor&, double);
template<class T> \
void funcName(const XTensor &a, XTensor &b, T num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b, num); \
if (a.enableGrad) { \
......
......@@ -23,6 +23,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -89,7 +90,7 @@ void Kernel##funcName(T1 * a, T1 * b, int size, T2 num)
template<class T> \
void _Cuda##funcName(const XTensor * a, XTensor * b, T num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
......
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Clip.h"
#include "Clip.cuh"
......@@ -43,7 +44,7 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
......@@ -110,8 +111,8 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
if (!b.isInit || !IsSameShaped(a, b)) {
InitTensorV2(&b, &a);
}
/* call _Clip function */
......
......@@ -21,6 +21,7 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../shape/IsSameShaped.h"
#include "Clip.h"
#include "Clip.cuh"
......@@ -36,7 +37,7 @@ set each entry to its clip value (CUDA Kernel)
>> size - size of the data array
*/
__global__
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -74,7 +75,7 @@ set each entry to its clip value
*/
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Compare.h"
#include "Compare.cuh"
......@@ -42,7 +43,7 @@ DTYPE myIsNotEqual(DTYPE a, DTYPE b)
#define _SIMPLE_COMPARE_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
......@@ -59,7 +60,7 @@ void _funcName(const XTensor * a, XTensor * b, DTYPE number)
#define _SIMPLE_COMPARE_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, DTYPE number) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
/* run it on GPUs */ \
......@@ -97,8 +98,8 @@ XTensor funcName(const XTensor &a, DTYPE number)
#define SIMPLE_COMPARE_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, DTYPE number) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b, number); \
}
......@@ -130,7 +131,7 @@ SIMPLE_COMPARE_FUNCTION_VOID(NotEqual, _NotEqual, MATH_NOTEQUAL)
#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, const XTensor * b, XTensor * c) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b, c)), \
CheckNTErrors((_IsSameShaped(a, b, c)), \
"Input and output tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
CheckDev(a->devID, b->devID); \
......@@ -150,7 +151,7 @@ void _funcName(const XTensor * a, const XTensor * b, XTensor * c)
#define _SIMPLE_MAX_MIN_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, const XTensor * b, XTensor *c) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b, c)), \
CheckNTErrors((_IsSameShaped(a, b, c)), \
"Input and output tensors should have the same type!"); \
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); \
CheckDev(a, b); \
......@@ -191,7 +192,7 @@ XTensor funcName(const XTensor & a, const XTensor & b)
#define SIMPLE_MAX_MIN_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, const XTensor &b, XTensor &c) \
{ \
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) { \
if (!c.isInit || !_IsSameShaped(&a, &c)) { \
InitTensor(&c, &a); \
} \
_funcName(&a, &b, &c); \
......
......@@ -22,6 +22,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Normalize.h"
#include "Normalize.cuh"
......@@ -46,26 +47,25 @@ void _Normalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((_IsSameShaped(input, output)), "Unmatched input tensors!");
CheckNTErrors((_IsSameShaped(a, b)), "Unmatched input tensors");
CheckNTErrors((_IsSameShaped(mean, var)), "Unmatched input tensors");
CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
CheckNTErrors((dim >= 0 && dim < input->order), "Incorrect reduction dimension!");
CheckNTErrors((input->order == mean->order + 1), "Incorrect reduction dimension!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i]), "Wrong size!");
stride *= input->dimSizeRDI[i];
if (i < dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i]), "Wrong size!");
blockNum *= input->dimSize[i];
}
else if (i > dimRDI) {
CheckNTErrors((input->dimSizeRDI[i] == mean->dimSizeRDI[i - 1]), "Wrong size!");
blockNum *= input->dimSizeRDI[i];
else if (i > dim) {
CheckNTErrors((input->dimSize[i] == mean->dimSize[i - 1]), "Wrong size!");
stride *= input->dimSize[i];
}
}
blockSize = stride * strideNum;
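/* e.g., for input of shape (2, 3, 4), dim = 1 and mean/var of shape (2, 4):
   blockNum = 2, strideNum = 3, stride = 4 and blockSize = 12 */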
......@@ -203,8 +203,8 @@ void Normalize(const XTensor &input, XTensor &output, int dim,
const XTensor &mean, const XTensor &var,
const XTensor &a, const XTensor &b, DTYPE epsilon)
{
if (!output.isInit || !XTensor::IsSameShaped(&input, &output)) {
InitTensor(&output, &input);
if (!output.isInit || !IsSameShaped(input, output)) {
InitTensorV2(&output, &input);
}
/* call _Normalize function */
......
......@@ -95,15 +95,14 @@ void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = input->order - dim - 1;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int stride = 1;
int strideNum = input->dimSize[dim];
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i > dim)
stride *= input->dimSize[i];
else if (i < dim)
blockNum *= input->dimSize[i];
}
int cudaGridSize[3];
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
......@@ -147,8 +148,8 @@ b = a * scale + shift
*/
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
if (!b.isInit || !IsSameShaped(a, b)) {
InitTensorV2(&b, &a);
}
/* call _ScaleAndShift function */
......
......@@ -22,6 +22,7 @@
#include <math.h>
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Unary.h"
#include "Unary.cuh"
......@@ -77,7 +78,7 @@ void _funcName(const XTensor * a, XTensor * b)
_cudaFuncName(a, b); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -108,7 +109,7 @@ void _funcName(const XTensor * a, XTensor * b)
if (a->devID >= 0) { \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -160,8 +161,8 @@ XTensor funcName(const XTensor & a)
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor & a, XTensor & b) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b); \
if (a.enableGrad) { \
......
......@@ -22,6 +22,7 @@
#include <math.h>
#include "../../XDevice.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Unary.h"
#include "Unary.cuh"
#include<cuda_runtime.h>
......@@ -154,7 +155,7 @@ void Kernel##funcName(T * a, T * b, int size) \
} \
void _Cuda##funcName(const XTensor * a, XTensor * b) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors(a->isSparse == false, "TODO!"); \
\
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../shape/IsSameShaped.h"
#include "CopyInGrid.h"
#include "CopyBlocksInGrid.h"
......@@ -38,14 +39,13 @@ in the k-th grid
*/
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
CheckNTErrors((_IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1;
int blockNum = blockNumInGrid;
int gridNum = 1;
for (int i = 0; i < blockDimRDI; i++)
blockSize *= s->dimSizeRDI[i];
for (int i = blockDim; i < s->order; i++)
blockSize *= s->dimSize[i];
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum);
......
......@@ -24,6 +24,7 @@
#include "CopyBlocks.h"
#include "Gather.h"
#include "../../XName.h"
#include "../utilities/SetAscendingOrder.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -52,26 +53,28 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
CheckNTErrors(dim < s->order && dim < t->order, "Too large a dimension specified!");
CheckNTErrors(s->unitSize == t->unitSize, "Unmatched tensors!");
int dimRDI = s->order - dim - 1;
int blockSizeSrc = 1;
int blockSizeTgt = 1;
int blockNumSrc = 1;
int blockNumTgt = 1;
int leadDimSizeSrc = s->dimSizeRDI[dimRDI];
int leadDimSizeTgt = t->dimSizeRDI[dimRDI];
int leadDimSizeSrc = s->dimSize[dim];
int leadDimSizeTgt = t->dimSize[dim];
int indexOffsetNum = 1;
for (int i = 0; i < dimRDI; i++) {
blockSizeSrc *= s->dimSizeRDI[i];
blockSizeTgt *= t->dimSizeRDI[i];
for (int i = dim + 1; i < s->order; i++) {
blockSizeSrc *= s->dimSize[i];
}
for (int i = dim + 1; i < t->order; i++) {
blockSizeTgt *= t->dimSize[i];
}
for (int i = 0; i <= dim; i++)
{
blockNumSrc *= s->dimSize[i];
blockNumTgt *= t->dimSize[i];
}
for (int i = dimRDI; i < s->order; i++)
blockNumSrc *= s->dimSizeRDI[i];
for (int i = dimRDI; i < t->order; i++)
blockNumTgt *= t->dimSizeRDI[i];
CheckNTErrors(blockSizeSrc == blockSizeTgt, "Unmatched tensors!");
indexOffsetNum = blockNumSrc / s->dimSizeRDI[dimRDI];
indexOffsetNum = blockNumSrc / s->dimSize[dim];
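/* e.g., for s of shape (3, 2, 4) and dim = 2: blockSizeSrc = 1,
   blockNumSrc = 24, leadDimSizeSrc = 4, so indexOffsetNum = 6 */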
int realIndexSize = indexOffsetNum * indexSize * copyNum;
int * realSrcIndex = new int[realIndexSize];
......@@ -206,7 +209,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim,
const XTensor * srcIndex, int copyNum)
{
XTensor * tgtIndex = NewTensor(srcIndex);
tgtIndex->SetAscendingOrder(0);
SetAscendingOrder(*tgtIndex, 0);
_CopyIndexed(s, t, dim, srcIndex, tgtIndex, copyNum);
delete tgtIndex;
......@@ -218,14 +221,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (4, 2, 3) and dim = 0,
we have 4 sub-tensors of size (2, 3)
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [0,1] and copyNum = 2,
we actually copy the source sub-tensors 0, 1, 1 and 2
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor & s, int dim,
......@@ -276,14 +279,14 @@ make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (4, 2, 3) and dim = 0,
we have 4 sub-tensors of size (2, 3)
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [0,1] and copyNum = 2,
we actually copy the source sub-tensors 0, 1, 1 and 2
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
......
......@@ -33,6 +33,29 @@ gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - index of the source sub-tensors
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3, 2)
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors(s->devID == t->devID, "the data must be kept on the same device!");
CheckNTErrors((t->unitSize == srcIndex->unitSize), "Unmatched tensors!");
#ifdef USE_CUDA
if (s->devID >= 0 && t->devID >= 0) {
_CudaGather(s, t, srcIndex, dim);
return;
}
#endif
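/* note: there is no CPU code path here yet, so the call is
   a no-op when the tensors are kept on the CPU */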
}
/*
gather indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
*/
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex)
......@@ -79,10 +102,15 @@ XTensor Gather(XTensor &s, XTensor &index)
CheckNTErrors(s.order == 2, "The order of the input tensor must be 2!");
int order = index.order + 1;
int order = s.order;
int * dimSize = new int[order];
memcpy(dimSize, index.dimSize, index.order * sizeof(int));
dimSize[index.order] = s.GetDim(-1);
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = index.unitNum;
else
dimSize[i] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
......@@ -93,11 +121,25 @@ XTensor Gather(XTensor &s, XTensor &index)
_Gather(&s, &t, &index);
/* tensor connection */
if (s.enableGrad) {
if (s.enableGrad)
{
XLink::MakeLink(&s, &index, &t, MOVEMENT_GATHER);
}
return t;
if(index.order > 1) {
int * dims = new int[index.order + 1];
memcpy(dims, index.dimSize, index.order * sizeof(int));
dims[index.order] = t.GetDim(-1);
XTensor tt;
tt = Reshape(t, index.order + 1, dims);
delete[] dims;
return tt;
}
else {
return t;
}
}
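/* a minimal usage sketch (not part of this commit; GatherExample and the
   shapes are hypothetical): gathering rows of a 5 x 3 matrix with a
   2 x 2 index should yield a 2 x 2 x 3 result, since the output is
   reshaped whenever index.order > 1 */
void GatherExample()
{
    XTensor s, index;
    InitTensor2DV2(&s, 5, 3, X_FLOAT, -1);
    InitTensor2DV2(&index, 2, 2, X_INT, -1);
    s.SetDataRand(0.0F, 1.0F);
    int id[4] = {0, 2, 4, 1};
    index.SetData(id, 4);
    XTensor t = Gather(s, index);      /* t.order == 3 */
}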
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -68,6 +68,35 @@ void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int
/*
gather indexed sub-tensors(cuda version)
>> sData - the data pointer of the source tensor
>> tData - the data pointer of the target tensor
>> sIndex - the index of the source tensor
>> stride - stride of a data block (product of the dims after the gather dimension)
>> strideNum - number of items along the gather dimension
>> blockNum - number of data blocks
*/
__global__
void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int stride, int strideNum, int blockNum)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy % stride;
int size = stride * strideNum * blockNum;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
tData[i] = sData[sIndex[i]];
}
}
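/* note: sIndex maps each flattened position i of the target tensor to an
   element offset in the source data, i.e., tData[i] = sData[sIndex[i]] */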
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
......@@ -117,6 +146,44 @@ void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex)
BacktoCudaDev(devID, devIDBackup);
}
/*
gather indexed sub-tensors(cuda version)
>> s - the source tensor
>> t - the target tensor
>> srcIndex - the tensor to save the index of the source tensor
>> dim - the leading dimension to define "sub-tensors"
*/
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim)
{
int devID = s->devID;
XMem * mem = s->mem;
int stride = 1;
int blockNum = 1;
int indexSize = srcIndex->unitNum;
int strideNum = srcIndex->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= srcIndex->dimSize[i];
for (int i = dim + 1; i < srcIndex->order; i++)
stride *= srcIndex->dimSize[i];
int * sIndex = NULL;
if (srcIndex->devID < 0) {
sIndex = mem != NULL ?
         (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
         (int*)XMemAlloc(devID, sizeof(int) * indexSize);
XMemCopy(sIndex, devID, srcIndex->data, -1, sizeof(int) * indexSize);
}
else
sIndex = (int *)srcIndex->data;
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(devID, max(32, strideNum), stride*blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelGather << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > ((DTYPE *)s->data, (DTYPE *)t->data, sIndex, stride, strideNum, blockNum);

/* release the temporary index buffer if it was uploaded from the CPU */
if (srcIndex->devID < 0) {
    if (mem != NULL)
        mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
    else
        XMemFree(devID, sIndex);
}
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -32,6 +32,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather indexed sub-tensors (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather indexed sub-tensors along the given dimension (cuda version) */
void _CudaGather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,6 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* gather selected sub-tensors */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex);
/* gather selected sub-tensors according to the given dimension */
void _Gather(const XTensor * s, XTensor * t, XTensor * srcIndex, int dim);
/* gather selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Gather(XTensor &s, XTensor &index);
......
......@@ -31,6 +31,9 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min-valued items along a dimension of the tensor (cuda version) */
void _CudaReduceMin(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,14 +29,20 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */
void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* get the min value of the items along a dimension of the tensor. */
void _ReduceMin(const XTensor * input, XTensor * output, int dim);
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim);
/*
get the min value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMin(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -39,8 +39,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
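/* i.e., output = sum(input, dim) / input->dimSize[dim] */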
......@@ -112,7 +111,7 @@ void ReduceMean(const XTensor &input, XTensor &output, int dim)
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
InitTensorV2(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
......
......@@ -22,6 +22,7 @@
#include <math.h>
#include "ReduceSum.h"
#include "ReduceSum.cuh"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
#include "../../XBLAS.h"
#include "VectorBuffer.h"
......@@ -51,17 +52,16 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
CheckNTErrors((shift == NULL || _IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
CheckNTErrors(dimRDI >= 0, "Wrong dimension!");
CheckNTErrors(dim < input->order, "Wrong dimension!");
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
if(i < dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i]), "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
else if(i > dim){
CheckNTErrors((input->dimSize[i] == output->dimSize[i - 1]), "Unmatched tensors!");
}
}
......@@ -74,21 +74,21 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
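/* e.g., for input of shape (2, 3, 4) and dim = 1: blockNum = 2 (product of
   the dims before dim), strideNum = 3 (the reduced dim), stride = 4
   (product of the dims after dim) and blockSize = 12 */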
if(input->dimSizeRDI[0] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSizeRDI[0] >= 32){
if(input->dimSize[input->order - 1] % (4 * 32 / sizeof(DTYPE)) == 0 && input->dimSize[input->order - 1] >= 32){
int vecBufLength = 32 / sizeof(DTYPE);
if(dimRDI == 0){
if(dim == input->order - 1){
//the reduced dim is the last one, so the data is contiguous
for(int i = 0; i < blockNum; i++){
// stride = 1
......@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSizeRDI[0] / 32; j++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......@@ -334,7 +334,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &sh
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
InitTensorV2(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
......@@ -429,7 +429,7 @@ void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
InitTensorV2(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
......
......@@ -692,13 +692,12 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
CheckNTErrors(input->dataType == output->dataType, "Unmatched data types!");
CheckNTErrors(shift == NULL || output->unitNum == shift->unitNum, "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){
if(i < dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i], "Unmatched tensors!");
if(i < dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i], "Unmatched tensors!");
}
else if(i > dimRDI){
CheckNTErrors(input->dimSizeRDI[i] == output->dimSizeRDI[i - 1], "Unmatched tensors!");
else if(i > dim){
CheckNTErrors(input->dimSize[i] == output->dimSize[i - 1], "Unmatched tensors!");
}
}
......@@ -709,31 +708,23 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
int cudaBlockSize[3];
int iter = 0;
int stride = 1;
int strideNum = input->dimSizeRDI[dimRDI];
int strideNum = input->dimSize[dim];
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < input->order; i++) {
if (i < dimRDI)
stride *= input->dimSizeRDI[i];
else if (i > dimRDI)
blockNum *= input->dimSizeRDI[i];
if (i < dim)
blockNum *= input->dimSize[i];
else if (i > dim)
stride *= input->dimSize[i];
}
blockSize = stride * strideNum;
int devID = input->devID;
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(input->devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data : NULL;
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (stride == 1 && blockNum >= 10) {
dim3 grids;
......@@ -751,7 +742,7 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum, sp, power, isExp);
}
}
else if (stride != 1 && stride * blockNum > 4096){
else if (stride != 1 && stride * blockNum > 4096) {
//GDevs->GetGridAndBlockSize2D(devID, stride * blockNum, strideNum,MAX_INT, cudaGridSize, cudaBlockSize);
//unsigned int* goutput = (unsigned int *)input->data;
//convert2uintV2 << <dim3(cudaGridSize[0], cudaGridSize[1]), dim3(cudaBlockSize[0], cudaBlockSize[1]) >> > ((float*)input->data, goutput, stride, strideNum, blockNum, strideNum*blockNum*stride);
......@@ -761,6 +752,14 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
strideNum, blockNum,sp, power, isExp);
}
else {
XMem * mem = input->mem;
GDevs.GetCudaThread2D(devID, strideNum, stride * blockNum, MAX_INT, cudaGridSize, cudaBlockSize);
int bufSize = input->unitSize * cudaGridSize[0] * stride * blockNum * 2;
DTYPE * buf = mem != NULL ? (DTYPE*)mem->AllocBuf(mem->devID, bufSize) : (DTYPE*)XMemAlloc(devID, bufSize);
DTYPE * buf1 = buf;
DTYPE * buf2 = buf + cudaGridSize[0] * stride * blockNum;
do {
if (input->dataType == DEFAULT_DTYPE) {
DTYPE * iData = NULL;
......@@ -904,13 +903,15 @@ void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTen
iter++;
} while (strideNum > 1);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize);
else
XMemFree(devID, buf);
}
ProtectCudaDev(input->devID, devIDBackup);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, bufSize);
else
XMemFree(input->devID, buf);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
......
......@@ -49,8 +49,8 @@ DTYPE _ReduceSumAll(const XTensor * source)
int dims[2] = {1, source->unitNum};
int one = 1;
XTensor * all = NewTensorBuf(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBuf(1, &one, source->dataType, 1.0F, source->devID, source->mem);
XTensor * all = NewTensorBufV2(2, dims, source->dataType, source->denseRatio, source->devID, source->mem);
XTensor * result = NewTensorBufV2(1, &one, source->dataType, 1.0F, source->devID, source->mem);
_CopyValues(source, all);
_ReduceSum(all, result, 1);
......@@ -74,7 +74,7 @@ DTYPE _ReduceSumAll(const XTensor * source)
int leadingDim = big->order - 1;
int * dimSize;
dimSize = getDimSize(big, leadingDim);
XTensor * little = NewTensor(big->order - 1, dimSize, source->dataType, source->denseRatio,
XTensor * little = NewTensorV2(big->order - 1, dimSize, source->dataType, source->denseRatio,
source->devID, source->mem);
_ReduceSum(big, little, leadingDim);
......
......@@ -109,7 +109,7 @@ void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTen
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
InitTensorV2(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
......
......@@ -38,8 +38,7 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
int num = input->dimSize[dim];
_ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
}
......@@ -112,7 +111,7 @@ void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTenso
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
InitTensorV2(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
......
......@@ -20,7 +20,7 @@
*/
#include "VectorBuffer.h"
#include "math.h"
namespace nts {
/* data size for each buffer */
int VectorBuffer::size()
......@@ -168,4 +168,12 @@ VectorBuffer VectorBuffer::maxData(const VectorBuffer &a) {
return *this;
}
/* compute the min of two buffers */
VectorBuffer VectorBuffer::minData(const VectorBuffer &a) {
for (int i = 0; i != a.size(); i++) {
this->values[i] = MIN(a[i], this->values[i]);
}
return *this;
}
}/* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -19,8 +19,6 @@
* $Created by: ZHANG Yuhao (email: zhangyuhao@stu.neu.edu.cn) 2019-07-23
*/
//#include <cstring>
#include <math.h>
#include "../../XGlobal.h"
namespace nts {
......@@ -49,5 +47,8 @@ public:
/* compute the max of two buffers */
VectorBuffer maxData(const VectorBuffer &a);
/* compute the min of two buffers */
VectorBuffer minData(const VectorBuffer &a);
};
}
\ No newline at end of file
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Concatenate.h"
#include "Merge.h"
#include "ConcatenateSolely.h"
......@@ -44,7 +45,7 @@ void _Concatenate(const TensorList * smalls, XTensor * big, int dim)
XTensor * a = (XTensor*)smalls->GetItem(i - 1);
XTensor * b = (XTensor*)smalls->GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
if (!_IsSameShaped(a, b))
uniform = false;
}
......@@ -76,7 +77,7 @@ XTensor Concatenate(const TensorList &smalls, int dim)
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
if (!_IsSameShaped(a, b))
uniform = false;
}
XTensor * tensor = (XTensor*)smalls.GetItem(0);
......@@ -189,7 +190,7 @@ void Concatenate(const TensorList & smalls, XTensor & big, int dim)
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
if (!_IsSameShaped(a, b))
uniform = false;
}
......@@ -207,7 +208,7 @@ void Concatenate(const TensorList & smalls, XTensor & big, int dim)
}
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
InitTensorV2(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
else {
for (int i = 0; i < tensor->order; i++)
......@@ -222,7 +223,7 @@ void Concatenate(const TensorList & smalls, XTensor & big, int dim)
dimSize[dim] = catDimSize;
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
InitTensorV2(&big, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
}
/* destroy variables */
delete[] dimSize;
......@@ -290,7 +291,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
XTensor * a = (XTensor*)smalls.Get(i - 1);
XTensor * b = (XTensor*)smalls.Get(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsSameShaped(a, b))
if (!_IsSameShaped(a, b))
uniform = false;
}
XTensor * tensor = (XTensor*)smalls.Get(0);
......
......@@ -39,30 +39,29 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
CheckNTErrors((big->order == tensor->order), "Unmatched tensor orders!");
for (int j = 0; j < big->order; j++) {
if (j != dimRDI) {
CheckNTErrors((big->dimSizeRDI[j] == tensor->dimSizeRDI[j]), "Unmatched tensor sizes!");
if (j != dim) {
CheckNTErrors((big->dimSize[j] == tensor->dimSize[j]), "Unmatched tensor sizes!");
}
else {
catDimSize += tensor->dimSizeRDI[j];
catDimSize += tensor->dimSize[j];
}
}
}
CheckNTErrors((catDimSize == big->dimSizeRDI[dimRDI]), "Unmatched tensor sizes!");
CheckNTErrors((catDimSize == big->dimSize[dim]), "Unmatched tensor sizes!");
int stride = 1;
for (int i = 0; i < dimRDI; i++)
stride *= big->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < big->order; i++)
blockNum *= big->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
blockNum *= big->dimSize[i];
for (int i = dim + 1; i < big->order; i++)
stride *= big->dimSize[i];
int offset = 0;
......@@ -74,8 +73,8 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
if (smalls->count <= MIN_TENSOR_CAT_NUM) {
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
int sPitch = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
int tPitch = stride * big->dimSizeRDI[dimRDI] * big->unitSize;
int sPitch = stride * tensor->dimSize[dim] * tensor->unitSize;
int tPitch = stride * big->dimSize[dim] * big->unitSize;
int mSize = sPitch;
int n = blockNum;
XMemCopy2D((char*)big->data + offset, tPitch, big->devID,
......@@ -85,11 +84,11 @@ void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
}
}
else {
StrList* sourceArrays = new StrList(smalls->count);
StrList* sourceArrays = new StrList(smalls->count);
int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
blockSizes[i] = stride * tensor->dimSize[dim] * tensor->unitSize;
sourceArrays->Add((char*)tensor->data);
}
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-22
*/
#include "../../XTensor.h"
#include "IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
check whether the two tensors are of the same type and size
>> a - input tensor
>> b - another tensor to compare with
<< return - whether the two input tensors are identical
*/
bool _IsSameShaped(const XTensor * a, const XTensor * b)
{
if(a == NULL || b == NULL)
return false;
if(a->order != b->order)
return false;
for(int i = 0; i < a->order; i++){
if(a->dimSize[i] != b->dimSize[i])
return false;
}
if(a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
return false;
return true;
}
/*
check whether the two tensors are of the same type and size
>> a - input tensor
>> b - another tensor to compare with
<< return - whether the two input tensors are identical
*/
bool IsSameShaped(const XTensor & a, const XTensor & b)
{
return _IsSameShaped(&a, &b);
}
/*
check whether the three tensors are of the same type and size
>> a - input tensor
>> b - another tensor to compare with
>> c - the third tensor to compare with
<< return - whether the three input tensors are identical
*/
bool _IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c)
{
return _IsSameShaped(a, b) && _IsSameShaped(a, c);
}
/*
check whether the three matrices are in the same type and size
>> a - input tensor
>> b - another tensor to compare with
>> c - a third tensor to compare with
<< return - whether the three input tensors are identical
*/
bool IsSameShaped(const XTensor & a, const XTensor & b, const XTensor & c)
{
return _IsSameShaped(&a, &b, &c);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-22
*/
#ifndef __ISSAMESHAPED_H__
#define __ISSAMESHAPED_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* judge whether the two matrices are in the same type and size */
bool _IsSameShaped(const XTensor * a, const XTensor * b);
/* judge whether the two matrices are in the same type and size */
bool IsSameShaped(const XTensor & a, const XTensor & b);
/* judge whether the three matrices are in the same type and size */
bool _IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether the three matrices are in the same type and size */
bool IsSameShaped(const XTensor & a, const XTensor & b, const XTensor & c);
} // namespace nts(NiuTrans.Tensor)
#endif // __ISSAMESHAPED_H__
\ No newline at end of file
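For reference, a trimmed, self-contained stand-in for the checks declared above; the Shape struct and SameShaped names are illustrative only, and the real functions additionally compare denseRatio and isSparse:

#include <cassert>

// Trimmed stand-in for XTensor: only the fields the shape check reads.
struct Shape {
    int order;
    int dimSize[8];
    int dataType;            // e.g. 0 = float, 1 = int
};

// Mirrors _IsSameShaped above: same order, same extents, same data type.
bool SameShaped(const Shape * a, const Shape * b)
{
    if (a == 0 || b == 0)
        return false;
    if (a->order != b->order || a->dataType != b->dataType)
        return false;
    for (int i = 0; i < a->order; i++)
        if (a->dimSize[i] != b->dimSize[i])
            return false;
    return true;
}

// The three-tensor overload reduces to two pairwise checks, as above.
bool SameShaped(const Shape * a, const Shape * b, const Shape * c)
{
    return SameShaped(a, b) && SameShaped(a, c);
}

int main()
{
    Shape s1 = {2, {3, 4}, 0};
    Shape s2 = {2, {3, 4}, 0};
    Shape s3 = {2, {3, 5}, 0};
    assert(SameShaped(&s1, &s2) && !SameShaped(&s1, &s2, &s3));
    return 0;
}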
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Merge.h"
#include "MakeMergeBlockIndex.h"
#include "../movement/CopyBlocksOnSite.h"
......@@ -45,10 +46,8 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if(leadingDim < 0)
leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1;
int leadingDimRDI = s->order - leadingDim - 1;
if (leadingDimRDI < 0)
leadingDimRDI = s->order - 1;
if (leadingDim >= s->order)
leadingDim = leadingDim - s->order;
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -56,19 +55,20 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
CheckNTErrors((leadingDimRDI > whereToMergeRDI), "Invalid leading dimension!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
if (i == whereToMerge) {
CheckNTErrors((t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]),
"Unmatched tensor sizes!");
}
else if (i < leadingDimRDI){
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
else if (i < leadingDim){
CheckNTErrors((s->dimSize[i] == t->dimSize[i]),
"Unmatched tensor sizes!");
}
else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i - 1]),
else if (i > leadingDim) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i - 1]),
"Unmatched tensor sizes!");
}
}
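Concretely, under the new forward-index checks the merged tensor drops leadingDim and multiplies its extent into whereToMerge. A standalone sketch with hypothetical shapes, assuming leadingDim < whereToMerge as asserted above:

#include <cassert>

// Shape arithmetic of _Merge for s = (3, 2, 4), leadingDim = 0, whereToMerge = 2:
// dimension 0 is folded into dimension 2, giving t = (2, 12).
int main()
{
    const int s[] = {3, 2, 4};
    const int leadingDim = 0, whereToMerge = 2, order = 3;

    int t[2];
    for (int i = 0, j = 0; i < order; i++) {
        if (i == leadingDim)
            continue;                  // this dimension disappears
        t[j] = s[i];
        if (i == whereToMerge)
            t[j] *= s[leadingDim];     // and is absorbed here
        j++;
    }
    // matches t->dimSize[i - 1] == s->dimSize[i] * s->dimSize[leadingDim]
    assert(t[0] == 2 && t[1] == 12);
    return 0;
}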
......@@ -77,14 +77,14 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
int blockNum = 1;
int gridSize = 1;
int gridNum = 1;
int mergedNum = s->dimSizeRDI[leadingDimRDI];
int mergedNum = s->dimSize[leadingDim];
for (int i = 0; i < s->order; i++) {
if (i <= leadingDimRDI) {
if (i <= whereToMergeRDI)
blockSize *= s->dimSizeRDI[i];
if (i >= leadingDim) {
if (i >= whereToMerge)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
}
......@@ -121,7 +121,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
if (!isOnSameDevice)
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
int blockNumInMerge = s->dimSizeRDI[leadingDimRDI];
int blockNumInMerge = s->dimSize[leadingDim];
int splitSizeInGrid = gridSize / blockNumInMerge;
int realBlockSize = blockSize * t->unitSize;
......@@ -254,7 +254,7 @@ void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim)
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
InitTensor(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
InitTensorV2(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
/* destroy variables */
delete[] dimSize;
......@@ -284,7 +284,7 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count > 0), "Empty list!");
CheckNTErrors((whereToMerge >= 0 && whereToMerge < t->order), "Wrong range of whereToMerge");
bool uniform = true;
......@@ -310,12 +310,11 @@ void _Merge(const TensorList * smalls, XTensor * t, int whereToMerge)
int mergedNum = smalls->count;
XTensor * s0 = smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI)
blockSize *= s0->dimSizeRDI[i];
if (i >= whereToMerge)
blockSize *= s0->dimSize[i];
else
blockNum *= s0->dimSizeRDI[i];
blockNum *= s0->dimSize[i];
}
CheckNTErrors((s0->unitNum % (blockSize * blockNum) == 0), "Incorrect size!");
......@@ -433,7 +432,7 @@ merge two tensors into a big tensor (return an XTensor structure)
*/
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
{
CheckNTErrors(XTensor::IsSameShaped(&smallA, &smallB),
CheckNTErrors(IsSameShaped(smallA, smallB),
"The two tensors must be of the same size!");
int order = smallA.order;
......
......@@ -46,8 +46,6 @@ void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
} // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__
\ No newline at end of file
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "Reshape.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -52,8 +53,8 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize)
{
if (!t.isInit || !XTensor::IsSameShaped(&t, &s)) {
InitTensor(&t, &s);
if (!t.isInit || !IsSameShaped(t, s)) {
InitTensorV2(&t, &s);
}
/* call Reshape function */
......
......@@ -31,7 +31,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M)
>> s - the source tensor
>> t - the target tensor (for return)
......@@ -46,23 +46,22 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
CheckNTErrors((t->dimSizeRDI[t->order - 1] == splitNum), "Incorrect tensor sizes!");
CheckNTErrors((t->dimSize[0] == splitNum), "Incorrect tensor sizes!");
int whereToSplitRDI = s->order - whereToSplit - 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
if (i == whereToSplit) {
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1] * splitNum),
"Unmatched tensor sizes!");
}
else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
CheckNTErrors((s->dimSize[i] == t->dimSize[i + 1]),
"Unmatched tensor sizes!");
}
}
/* for the case that we split the first dimension. Actually
(N, M) and (3, N/3, M) have the same memory layout */
if (s->order - 1 == whereToSplitRDI) {
if (0 == whereToSplit) {
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitNum * s->unitSize);
return;
}
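The early return works because a dimension-0 split is a pure reshape: (N, M) and (splitNum, N/splitNum, M) enumerate elements in the same row-major order, so one flat copy suffices. A standalone illustration (array contents are hypothetical):

#include <cassert>
#include <cstring>

int main()
{
    // A 6 x 2 row-major source, split along dimension 0 into 3 parts.
    float s[6][2] = {{0,1},{2,3},{4,5},{6,7},{8,9},{10,11}};
    float t[3][2][2];                  // (splitNum, N/splitNum, M)

    // Same memory layout, so the split is a single flat copy.
    memcpy(t, s, sizeof(s));

    assert(t[1][0][0] == 4.0f);        // block 1 starts at row 2 of s
    return 0;
}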
......@@ -70,14 +69,14 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= s->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= s->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= s->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= s->dimSize[i];
else
blockNum *= s->dimSizeRDI[i];
blockNum *= s->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......@@ -184,7 +183,7 @@ bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (3, N/3, M) (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
......@@ -244,7 +243,7 @@ void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum)
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
InitTensor(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
InitTensorV2(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
/* destroy variables */
delete[] dimSize;
......@@ -276,7 +275,6 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
CheckNTErrors((smalls->count > 0), "Wrong input!");
int whereToSplitRDI = big->order - whereToSplit - 1;
bool uniform = true;
for (int i = 0; i < smalls->count; i++) {
......@@ -292,14 +290,14 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < big->order; i++) {
if (i == whereToSplitRDI) {
blockSize *= big->dimSizeRDI[i] / splitNum;
if (i == whereToSplit) {
blockSize *= big->dimSize[i] / splitNum;
blockNum *= splitNum;
}
else if (i < whereToSplitRDI)
blockSize *= big->dimSizeRDI[i];
else if (i > whereToSplit)
blockSize *= big->dimSize[i];
else
blockNum *= big->dimSizeRDI[i];
blockNum *= big->dimSize[i];
}
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
......
......@@ -21,6 +21,7 @@
#include "Squeeze.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -37,7 +38,7 @@ void _Squeeze(XTensor * source, XTensor * target, int leadingDim)
{
int order = target->order;
CheckNTErrors(XTensor::IsSameShaped(source, target),
CheckNTErrors(_IsSameShaped(source, target),
"The source and target tensor must be of the same size!");
CheckNTErrors(leadingDim >= -1 && leadingDim < order,
"Wrong leading dimension");
......@@ -130,8 +131,8 @@ XTensor Squeeze(XTensor & source, int leadingDim)
void Squeeze(XTensor & source, XTensor & target, int leadingDim)
{
if (!target.isInit || !XTensor::IsSameShaped(&source, &target)) {
InitTensor(&target, &source);
if (!target.isInit || !IsSameShaped(source, target)) {
InitTensorV2(&target, &source);
}
/* call _Squeeze function */
......
......@@ -20,6 +20,7 @@
*/
#include "Stack.h"
#include "IsSameShaped.h"
#include "../../XUtility.h"
#include "../../XName.h"
......@@ -37,7 +38,7 @@ void _Stack(const TensorList * smalls, XTensor * t, int dim)
for (int i = 1; i < count; i++) {
XTensor * tmp1 = smalls->GetItem(i);
XTensor * tmp2 = smalls->GetItem(i-1);
CheckNTErrors(XTensor::IsSameShaped(tmp1, tmp2), "The input tensors must be of the same size!");
CheckNTErrors(_IsSameShaped(tmp1, tmp2), "The input tensors must be of the same size!");
}
int blockSize = 1;
......@@ -165,7 +166,7 @@ void Stack(const TensorList &smalls, XTensor &t, int dim)
}
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
InitTensor(&t, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
InitTensorV2(&t, order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
/* destroy variables */
delete[] dimSize;
......
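Stack, added in this commit, inserts a new dimension of size smalls->count at dim; for dim = 0 each input simply becomes one contiguous block of the output. A standalone sketch on plain arrays (shapes and values are hypothetical, not the library call):

#include <cassert>

int main()
{
    // Two 2 x 3 inputs stacked at dim 0 give a 2 x 2 x 3 result.
    float a[6] = {0, 1, 2, 3, 4, 5};
    float b[6] = {6, 7, 8, 9, 10, 11};
    const float * smalls[2] = {a, b};

    float t[2][6];
    for (int i = 0; i < 2; i++)        // one contiguous block per input
        for (int j = 0; j < 6; j++)
            t[i][j] = smalls[i][j];

    assert(t[1][2] == 8.0f);
    return 0;
}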
......@@ -42,16 +42,15 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) {
if (i < dimRDI) {
CheckNTErrors((a->dimSizeRDI[i] == b->dimSizeRDI[i]), "Unmatched tensors!");
if (i < dim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i]), "Unmatched tensors!");
}
else if (i > dimRDI) {
CheckNTErrors((a->dimSizeRDI[i - 1] == b->dimSizeRDI[i]), "Unmatched tensors!");
else if (i > dim) {
CheckNTErrors((a->dimSize[i - 1] == b->dimSize[i]), "Unmatched tensors!");
}
else {
CheckNTErrors((dSize == b->dimSizeRDI[i]), "Unmatched tensors!");
CheckNTErrors((dSize == b->dimSize[i]), "Unmatched tensors!");
}
}
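With the RDI index gone, the replicated block in _Unsqueeze is the product of the source dimensions from dim onward, and each block appears dSize times in the output. A standalone sketch (shape hypothetical):

#include <cassert>

int main()
{
    // a = (2, 3) unsqueezed at dim = 1 with dSize = 2 gives b = (2, 2, 3).
    float a[2][3] = {{0, 1, 2}, {3, 4, 5}};
    float b[2][2][3];

    const int blockNum = 2, dSize = 2, blockSize = 3;  // blockSize = prod(dims[dim:])
    for (int n = 0; n < blockNum; n++)
        for (int d = 0; d < dSize; d++)
            for (int i = 0; i < blockSize; i++)
                b[n][d][i] = a[n][i];                  // each block copied dSize times

    assert(b[1][0][2] == 5.0f && b[1][1][2] == 5.0f);
    return 0;
}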
......@@ -60,8 +59,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockNumA = 1;
int blockNumB = 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
realBlockSize = blockSize * a->unitSize;
......@@ -184,7 +183,7 @@ void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize)
}
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
InitTensor(&b, order, dimSize, a.dataType, dr, a.devID, a.mem);
InitTensorV2(&b, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
......
......@@ -235,9 +235,8 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int blockSize = 1;
int blockNumA = 1;
int blockNumB = 1;
int dimRDI = b->order - dim - 1;
for (int i = 0; i < dimRDI; i++)
blockSize *= a->dimSizeRDI[i];
for (int i = dim; i < a->order; i++)
blockSize *= a->dimSize[i];
blockNumA = a->unitNum / blockSize;
blockNumB = b->unitNum / blockSize;
......@@ -250,7 +249,7 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (dimRDI == 0) {
if (dim == b->order - 1) {
GDevs.GetCudaThread2D(a->devID, dSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT) {
......
......@@ -22,6 +22,8 @@
#include <math.h>
#include "../../XTensor.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "../utilities/SetAscendingOrder.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "Sort.h"
......@@ -40,14 +42,13 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{
dim = MODX(dim, a->order);
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1;
/* make the index tensor */
index->SetAscendingOrder(dim);
SetAscendingOrder(*index, dim);
if (a->devID >= 0) {
#ifdef USE_CUDA
......@@ -58,16 +59,16 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
}
else {
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSize = stride * strideNum;
_CopyValues(a, b);
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) {
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
......
......@@ -217,20 +217,19 @@ void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * in
CheckNTErrors((a->order > dim && dim >= 0), "Incorrect dimension specified!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int dimRDI = a->order - dim - 1;
if (k < 0 || k > b->dimSizeRDI[dimRDI])
k = b->dimSizeRDI[dimRDI];
if (k < 0 || k > b->dimSize[dim])
k = b->dimSize[dim];
XMem * mem = a->mem;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int m = GetNextPower2(strideNum);
int n = stride * blockNum;
......
......@@ -45,15 +45,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors(index == NULL || a->order == index->order, "Unmatched input tensors!");
CheckNTErrors(index->dataType == X_INT, "Wrong data type!");
int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) {
if (i == dimRDI) {
CheckNTErrors(b->dimSizeRDI[i] == k, "A too large K");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == k, "Wrong size!");
if (i == dim) {
CheckNTErrors((b->dimSize[i] == k), "A too large K");
CheckNTErrors((index == NULL || index->dimSize[i] == k), "Wrong size!");
}
else {
CheckNTErrors(b->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
CheckNTErrors(index == NULL || index->dimSizeRDI[i] == a->dimSizeRDI[i], "Wrong size!");
CheckNTErrors((b->dimSize[i] == a->dimSize[i]), "Wrong size!");
CheckNTErrors((index == NULL || index->dimSize[i] == a->dimSize[i]), "Wrong size!");
}
}
......@@ -68,14 +67,14 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
int strideNumB = b->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
int strideNumB = b->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int blockSizeA = stride * strideNumA;
int blockSizeB = stride * strideNumB;
......
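Given that decomposition, top-k runs independently on each of the blockNum * stride slices of length strideNumA. A hedged standalone equivalent for one slice using std::partial_sort (the library's kernels differ; this only illustrates the per-slice semantics):

#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
    std::vector<float> slice = {0.3f, 2.0f, -1.0f, 5.0f};  // one slice, hypothetical values
    const int k = 2;

    std::vector<int> index(slice.size());
    for (size_t i = 0; i < index.size(); i++)
        index[i] = (int)i;

    // Keep the k largest values (and their indices), as _TopK does per slice.
    std::partial_sort(index.begin(), index.begin() + k, index.end(),
                      [&](int x, int y) { return slice[x] > slice[y]; });

    assert(index[0] == 3 && index[1] == 1);                // values 5.0 and 2.0
    return 0;
}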
......@@ -22,6 +22,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XTensor.h"
#include "../utilities/SetAscendingOrder.h"
#include "TopK.h"
#include "TopK.cuh"
#include "Sort.cuh"
......@@ -811,15 +812,14 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
CheckNTErrors((b->dimSize[dim] == k), "A too large K");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNumA = a->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNumA = a->dimSize[dim];
for (int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int workerNum = blockNum < 16 ? 64 : 32;
/* adjust the thread num according size of k for fitting the share memory size */
......@@ -828,7 +828,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
else if (k < 22) workerNum = 128;
else if (k < 44) workerNum = 64;
else workerNum = 32;
int cudaGrids[3];
int cudaBlocks[3];
......@@ -862,7 +862,7 @@ void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
//indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));
/* make the index tensor */
//indexA->SetAscendingOrder(dim);
//SetAscendingOrder(*indexA, dim);
//_CudaSortBig(a, b, indexA, index, dim, k);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-22
*/
#include "../../XTensor.h"
#include "../../XUtility.h"
#include "CheckData.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* compare two numbers */
bool IsFloatEqual(DTYPE a, DTYPE b, float absError, float relError)
{
if(a == b)
return true;
if(fabs(a - b) < absError)
return true;
if(fabs(a) < fabs(b))
return (fabs((a - b) / b) < relError) ? true : false;
else
return (fabs((a - b) / a) < relError) ? true : false;
}
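The predicate accepts either criterion and normalizes the relative test by the larger magnitude, so it is symmetric in a and b. A quick standalone check of what passes and fails (the tolerances below are illustrative, not library defaults):

#include <cassert>
#include <cmath>

// Standalone copy of the predicate above, for illustration.
static bool FloatEqual(float a, float b, float absError, float relError)
{
    if (a == b) return true;
    if (std::fabs(a - b) < absError) return true;
    float denom = std::fabs(a) < std::fabs(b) ? b : a;
    return std::fabs((a - b) / denom) < relError;
}

int main()
{
    assert(FloatEqual(1.0f, 1.0f + 1e-6f, 1e-4f, 1e-4f));  // passes on absolute error
    assert(FloatEqual(1000.0f, 1000.05f, 1e-4f, 1e-4f));   // passes on relative error
    assert(!FloatEqual(1.0f, 1.1f, 1e-4f, 1e-4f));         // fails both
    return 0;
}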
/* check whether the data array is the same as the answer
>> tensor - input tensor
>> d - input data (it must be on CPUs)
>> num - number of data items
>> beg - where we start this in the data array of the tensor
*/
bool _CheckData(const XTensor * tensor, const void * d, int num, int beg)
{
if (tensor->data == NULL || d == NULL)
return false;
CheckNTErrors(!tensor->isSparse, "TODO");
CheckNTErrors(num == tensor->unitNum - beg, "Illegal size!");
if (tensor->devID < 0) {
return !memcmp(tensor->data, d, num * tensor->unitSize);
}
#ifdef USE_CUDA
else {
char * copy = new char[num * tensor->unitSize];
XMemCopy(copy, -1, tensor->data, tensor->devID, num * tensor->unitSize);
int cmpResult = memcmp(copy, d, num * tensor->unitSize);
bool result = (cmpResult == 0) ? true : false;
delete[] copy;
return result;
}
#endif
return true;
}
/* check whether the data array is the same as the answer
>> tensor - input tensor
>> d - input data (it must be on CPUs)
>> num - number of data items
>> tolerance - the error tolerance between the result and the answer
>> beg - where we start this in the data array of the tensor
*/
bool _CheckData(const XTensor * tensor, const void * d, int num, float tolerance, int beg)
{
if (tensor->data == NULL || d == NULL)
return false;
CheckNTErrors(!tensor->isSparse, "TODO");
CheckNTErrors(num == tensor->unitNum - beg, "Illegal size!");
DTYPE * valuePrt = (DTYPE*)tensor->data;
DTYPE value = 0;
DTYPE * answerPrt = (DTYPE*)d;
for (int i = beg; i < num; i++) {
value = ToCPU(tensor->devID, valuePrt);
if(IsFloatEqual(value, *answerPrt, tolerance, 1e-4F) == false)
return false;
valuePrt++;
answerPrt++;
}
return true;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
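A hedged host-only equivalent of the tolerant comparison loop above (the per-element ToCPU copy is specific to device tensors and is omitted; the function name is illustrative):

#include <cmath>
#include <cstdio>

// Compare `num` host floats against an expected answer with a mixed
// absolute/relative tolerance, mirroring the loop in _CheckData.
static bool CheckHostData(const float * value, const float * answer,
                          int num, float tolerance)
{
    for (int i = 0; i < num; i++) {
        float a = value[i], b = answer[i];
        if (a == b || std::fabs(a - b) < tolerance)
            continue;
        float denom = std::fabs(a) < std::fabs(b) ? b : a;
        if (std::fabs((a - b) / denom) >= 1e-4f)   // same relError as the call above
            return false;
    }
    return true;
}

int main()
{
    float got[3] = {1.0f, 2.00001f, 3.0f};
    float want[3] = {1.0f, 2.0f, 3.0f};
    printf("%s\n", CheckHostData(got, want, 3, 1e-4f) ? "match" : "mismatch");
    return 0;
}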
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-22
*/
#ifndef __CHECKDATA_H__
#define __CHECKDATA_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* check whether the data array is the same as the answer */
bool _CheckData(const XTensor * tensor, const void * answer, int num, int beg = 0);
/* check whether the data array is the same as the answer */
bool _CheckData(const XTensor * tensor, const void * answer, int num, float tolerance, int beg = 0);
} // namespace nts(NiuTrans.Tensor)
#endif // __CHECKDATA_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-23
*/
#include "../../XTensor.h"
#include "SetAscendingOrder.cuh"
#include "SetAscendingOrder.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set the cell to the ascending order along a given dimension
>> tensor - input tensor
>> dim - the dimension specified
*/
void SetAscendingOrder(XTensor & tensor, int dim)
{
CheckNTErrors(dim < tensor.order, "Wrong dimension specified!");
CheckNTErrors(tensor.dataType == X_INT, "TODO!");
if(dim < 0){
int o = tensor.order;
int ds[MAX_TENSOR_DIM_NUM];
memcpy(ds, tensor.dimSize, sizeof(int) * tensor.order);
tensor.Reshape(tensor.unitNum);
SetAscendingOrder(tensor, 0);
tensor.Reshape(o, ds);
return;
}
if(tensor.devID >= 0){
#ifdef USE_CUDA
CudaSetAscendingOrder(&tensor, dim);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
}
else{
int stride = 1;
int blockNum = 1;
int strideNum = tensor.dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= tensor.dimSize[i];
for(int i = dim + 1; i < tensor.order; i++)
stride *= tensor.dimSize[i];
for(int k = 0; k < blockNum; k++){
for(int j = 0; j < strideNum; j++){
int * d = (int*)tensor.data + stride * strideNum * k + stride * j;
for(int i = 0; i < stride; i++)
d[i] = j;
}
}
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
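On a concrete shape, the CPU branch above behaves as follows: for a (2, 3) int tensor and dim = 1, every row becomes 0, 1, 2. A standalone sketch of the same triple loop (shape hypothetical):

#include <cassert>

int main()
{
    // tensor shape (2, 3), dim = 1: blockNum = 2, strideNum = 3, stride = 1
    int data[2][3] = {};
    const int blockNum = 2, strideNum = 3, stride = 1;

    int * base = &data[0][0];
    for (int k = 0; k < blockNum; k++)
        for (int j = 0; j < strideNum; j++) {
            int * d = base + stride * strideNum * k + stride * j;
            for (int i = 0; i < stride; i++)
                d[i] = j;              // index along `dim`, repeated over the stride
        }

    assert(data[0][2] == 2 && data[1][0] == 0);
    return 0;
}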
......@@ -67,15 +67,14 @@ void CudaSetAscendingOrder(XTensor * a, int dim)
{
CheckNTErrors((a->dataType == X_INT), "TODO!");
int dimRDI = a->order - dim - 1;
int stride = 1;
int strideNum = a->dimSizeRDI[dimRDI];
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
int stride = 1;
int blockNum = 1;
for(int i = dimRDI + 1; i < a->order; i++)
blockNum *= a->dimSizeRDI[i];
int strideNum = a->dimSize[dim];
for(int i = 0; i < dim; i++)
blockNum *= a->dimSize[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int gridSize[3];
int blockSize[3];
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-23
*/
#ifndef __SETASCENDINGORDER_H__
#define __SETASCENDINGORDER_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set the cell to the ascending order along a given dimension */
void SetAscendingOrder(XTensor & tensor, int dim);
} // namespace nts(NiuTrans.Tensor)
#endif // __SETASCENDINGORDER_H__
\ No newline at end of file
......@@ -68,7 +68,7 @@ void _Dropout(const XTensor * x, XTensor * y, unsigned int seed, DTYPE dropProb,
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
XTensor * mask = NewTensor1DV2(unitNum, x->dataType, x->devID, x->mem);
mask->SetData(maskArray, unitNum);
/* call Multiply function for mask */
......@@ -113,7 +113,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
for (int i = 0; i < unitNum; i++)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor * mask = NewTensor1D(unitNum, x->dataType, x->devID, x->mem);
XTensor * mask = NewTensor1DV2(unitNum, x->dataType, x->devID, x->mem);
mask->SetData(maskArray, unitNum);
/* call MultiplyDim function for mask */
......@@ -149,13 +149,12 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
XTensor mask;
// int * maskArrayInt = NULL;
DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
if(leadingDim < 0 && leadingDim2 < 0){
XTensor mask;
InitTensor(&mask, &x);
InitTensorV2(&mask, &x);
_SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
......@@ -169,7 +168,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
maskArrayInt[i] = rand() % x.unitNum;
XTensor maskindex;
InitTensor1D(&maskindex, unitNum, X_INT, x.devID, x.mem);
InitTensor1DV2(&maskindex, unitNum, X_INT, x.devID, x.mem);
maskindex.SetData(maskArrayInt, unitNum);
......@@ -192,7 +191,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor1D(&mask, unitNum, x.dataType, x.devID, x.mem);
InitTensor1DV2(&mask, unitNum, x.dataType, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
......@@ -221,7 +220,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
dims[n] = x.GetDim(n);
dims[m] = x.GetDim(m);
InitTensor(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
InitTensorV2(&mask, x.order, dims, x.dataType, x.denseRatio,x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
......@@ -251,7 +250,7 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
maskArray[i] = RandomBernoulli(dropProb, scaleFactor);
XTensor mask;
InitTensor(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
InitTensorV2(&mask, x.order, x.dimSize, x.dataType, x.denseRatio, x.devID, x.mem);
mask.SetData(maskArray, unitNum);
delete[] maskArray;
......@@ -259,4 +258,4 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
return Multiply(x, mask);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -68,7 +68,7 @@ XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)
dimSize[i] = x.dimSize[i];
}
InitTensor1D(&c, x.unitNum, x.dataType, x.devID, x.mem);
InitTensor1DV2(&c, x.unitNum, x.dataType, x.devID, x.mem);
_SetDataFixedFloat(&c, 1.0F);
......
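All of the dropout variants above use inverted scaling: surviving activations are multiplied by 1 / (1 - dropProb) at training time so the expected value is unchanged and inference needs no rescaling. A standalone sketch of the mask construction (std::bernoulli_distribution stands in for RandomBernoulli; values are hypothetical):

#include <cstdio>
#include <random>

int main()
{
    const float dropProb = 0.3f;
    const float scaleFactor = 1.0f / (1.0f - dropProb);    // same factor as above

    std::mt19937 rng(42);
    std::bernoulli_distribution keep(1.0 - dropProb);

    float x[8] = {1, 2, 3, 4, 5, 6, 7, 8}, y[8];
    for (int i = 0; i < 8; i++) {
        float mask = keep(rng) ? scaleFactor : 0.0f;       // RandomBernoulli stand-in
        y[i] = x[i] * mask;                                // E[y] == x
    }

    for (int i = 0; i < 8; i++)
        printf("%g ", y[i]);
    printf("\n");
    return 0;
}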
......@@ -21,6 +21,7 @@
#include <stdlib.h>
#include "../XName.h"
#include "../../tensor/core/shape/IsSameShaped.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
......@@ -36,7 +37,7 @@ y = 1 if x > 1
*/
void _HardTanH(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
CheckNTErrors(_IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
#ifdef USE_CUDA
......@@ -87,8 +88,8 @@ XTensor HardTanH(const XTensor &x)
void HardTanH(const XTensor &x, XTensor &y)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
if (!y.isInit || !IsSameShaped(y, x)) {
InitTensorV2(&y, &x);
}
/* call _HardTanH function */
......
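The elementwise definition restated as a scalar helper, for reference; a sketch, not the library kernel:

#include <cassert>

// y = 1 if x > 1; y = x if -1 <= x <= 1; y = -1 if x < -1
static float HardTanHScalar(float x)
{
    if (x > 1.0f)  return 1.0f;
    if (x < -1.0f) return -1.0f;
    return x;
}

int main()
{
    assert(HardTanHScalar(2.5f) == 1.0f);
    assert(HardTanHScalar(-3.0f) == -1.0f);
    assert(HardTanHScalar(0.25f) == 0.25f);
    return 0;
}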
......@@ -23,6 +23,7 @@
#include "../XName.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
#include "../core/shape/IsSameShaped.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -33,7 +34,7 @@ identity function y = x
*/
void _Identity(const XTensor * x, XTensor * y)
{
CheckNTErrors(XTensor::IsSameShaped(x, y),
CheckNTErrors(_IsSameShaped(x, y),
"The input tensor and output tensor must have the same shape!")
_CopyValues(x, y);
}
......@@ -63,8 +64,8 @@ XTensor Identity(const XTensor &x)
void Identity(const XTensor &x, XTensor &y)
{
if (!y.isInit || !y.IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
if (!y.isInit || !IsSameShaped(y, x)) {
InitTensorV2(&y, &x);
}
/* call _Identity function */
......
......@@ -27,6 +27,7 @@
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
#include "../core/movement/CopyValues.h"
#include "../core/shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -49,7 +50,6 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
return;
}
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse &&
x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE)
{
......@@ -69,36 +69,36 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
XTensor * blockMax = NULL;
XTensor * blockSum = NULL;
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < x->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
max = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBuf(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
max = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
sum = NewTensorBufV2(x->order - 1, dimSize, x->dataType, x->denseRatio, x->devID, mem);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
if(leadDimRDI == 0){
if(leadDim == x->order - 1){
blockSize = y->unitNum;
blockNum = 1;
blockx = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
blockx = NewTensor2DV2(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2DV2(blockSize/dimensionSize, -dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2DV2(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
blockSum = NewTensor2DV2(blockSize/dimensionSize, -1, x->dataType, x->devID, mem);
}
else{
blockx = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2D(-stride, dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
blockSum = NewTensor2D(-stride, 1, x->dataType, x->devID, mem);
blockx = NewTensor2DV2(-stride, dimensionSize, x->dataType, x->devID, mem);
blocky = NewTensor2DV2(-stride, dimensionSize, x->dataType, x->devID, mem);
blockMax = NewTensor2DV2(-stride, 1, x->dataType, x->devID, mem);
blockSum = NewTensor2DV2(-stride, 1, x->dataType, x->devID, mem);
}
}
......@@ -137,7 +137,7 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
blockMax->data = mp;
blockSum->data = sp;
#ifdef USE_CUDA
if(leadDimRDI == 0)
if(leadDim == x->order - 1)
_CudaLogSoftmaxSumMax(blockx, blocky, 1, blockSum, blockMax);
else
_CudaLogSoftmaxSumMax(blockx, blocky, leadDim, blockSum, blockMax);
......@@ -210,8 +210,8 @@ void LogSoftmax(const XTensor &x, XTensor &y, int leadDim)
if (ld < 0)
ld = x.order - 1;
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
if (!y.isInit || !IsSameShaped(y, x)) {
InitTensorV2(&y, &x);
}
/* call _LogSoftmax function */
......@@ -298,7 +298,6 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
if(leadDim < 0)
leadDim = y->order - 1;
int leadDimRDI = y->order - leadDim - 1;
#ifdef USE_CUDA
if (gold->devID >= 0) {
_CudaLogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
......@@ -306,12 +305,12 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
#endif
int dimensionSize = y->dimSizeRDI[leadDimRDI];
int dimensionSize = y->dimSize[leadDim];
int stride = 1;
int blockSize = 1;
int blockNum = 1;
for (int i = 0; i < leadDimRDI; i++)
stride *= y->dimSizeRDI[i];
for (int i = leadDim + 1; i < y->order; i++)
stride *= y->dimSize[i];
blockSize = stride * dimensionSize;
blockNum = y->unitNum / blockSize;
......@@ -338,10 +337,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -352,7 +351,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else {
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((_IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......@@ -395,10 +394,10 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int key = gold->GetKeyInSparse(i);
DTYPE value = gold->GetInSparse(i);
int offset = key;
if (dedx->dimSizeRDI[0] != gm) {
if (dedx->dimSize[dedx->order - 1] != gm) {
int mi = key % gm;
int ni = key / gm;
int key2 = ni * dedx->dimSizeRDI[0] + mi;
int key2 = ni * dedx->dimSize[dedx->order - 1] + mi;
offset = key2;
}
if (key >= 0 && key < size)
......@@ -406,7 +405,7 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
}
}
else {
CheckNTErrors((XTensor::IsSameShaped(gold, y)), "The tensors must be of the same size!");
CheckNTErrors((_IsSameShaped(gold, y)), "The tensors must be of the same size!");
for (int k = 0; k < blockNum; k++) {
gp = (DTYPE*)gold->data + k * blockSize;
op = (DTYPE*)y->data + k * blockSize;
......@@ -430,11 +429,11 @@ void _LogSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* for columns with no xs we set dE/ds = 0 */
if (gold != NULL && gold->isSparse) {
CheckNTErrors((gold->order == 2), "The gold standard tensor must be of order 2!");
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSizeRDI[0]) {
if ((gold->dimSize[1] > 1 && !gold->isAllValued[0]) || gold->dimSize[1] != dedx->dimSize[dedx->order - 1]) {
int gn = gold->dimSize[0];
int gm = gold->dimSize[1];
int sm = dedx->dimSizeRDI[0];
int sn = dedx->dimSizeRDI[1];
int sm = dedx->dimSize[dedx->order - 1];
int sn = dedx->dimSize[dedx->order - 2];
int * flags = new int[sm];
memset(flags, 0, sizeof(int)*sm);
......
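The max and shifted-sum buffers above implement the standard stabilized form log softmax(x)_i = (x_i - max) - log(sum_j exp(x_j - max)). A standalone sketch over one row (values hypothetical):

#include <cmath>
#include <cstdio>

int main()
{
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    // Subtract the max before exponentiating so exp() cannot overflow.
    float mx = x[0];
    for (int i = 1; i < 4; i++)
        mx = std::fmax(mx, x[i]);

    float sum = 0.0f;
    for (int i = 0; i < 4; i++)
        sum += std::exp(x[i] - mx);

    for (int i = 0; i < 4; i++)
        printf("%f ", (x[i] - mx) - std::log(sum));  // log softmax, as in _LogSoftmax
    printf("\n");
    return 0;
}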