Commit e925cfd9 by huchi

Refactor the translator engine for NMT

parent 143e048c
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
#include "./tensor/function/FHeader.h"
#include "./tensor/core/CHeader.h"
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
using namespace transformer;
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
TransformerMain(argc - 1, argv + 1);
//_CrtDumpMemoryLeaks();
return 0;
}
#include "Model.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* register a parameter with a unique name */
void Model::Register(const char* name, Dim dims, TENSOR_DATA_TYPE dataType, int devID)
{
parameters.AddParameter(name, dims, dataType, devID);
}
/* get a parameter by its name */
XTensor* Model::operator[](const char* name)
{
return parameters.GetParameter(name);
}
/* load a model from a binary file */
void Model::Load(const char* fn)
{
CheckNTErrors(parameters.list.Size() > 0, "empty tensor list");
FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "unable to open the model file");
LongList offset(parameters.list.Size());
/* check the number of parameters */
unsigned long int number;
fread(&number, sizeof(number), 1, file);
CheckNTErrors(number == parameters.list.Size(), "parameter number not matched");
/* read the offsets of the parameters from the file */
offset.count = parameters.list.Size();
fread(offset.items, sizeof(long), offset.Size(), file);
/* read parameters from the file */
for (int i = 0; i < offset.Size(); i++) {
parameters.list[i]->BinaryRead(file, offset[i]);
}
fclose(file);
}
/* dump a model to a binary file */
void Model::Dump(const char* fn)
{
FILE* file = fopen(fn, "wb");
/* dump the number of parameters */
unsigned long int number = parameters.list.Size();
fwrite(&number, sizeof(number), 1, file);
/* dump the offsets of the parameters */
unsigned long int offset = sizeof(number);
for (int i = 0; i < parameters.list.Size(); i++) {
if (i > 0) {
offset += parameters.list[i - 1]->unitNum;
}
fwrite(&offset, sizeof(offset), 1, file);
}
/* dump parameters to the file */
for (int i = 0; i < parameters.list.Size(); i++) {
parameters.list[i]->BinaryDump(file);
}
fclose(file);
}
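/* for reference, the on-disk layout shared by Dump and Load above:
     [unsigned long]       number of parameters
     [unsigned long] x N   per-parameter offsets, in list order
     [raw data]            parameter values, written by BinaryDump in list order */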
/* get a parameter by its name */
XTensor* Model::Get(const char* name)
{
return parameters.GetParameter(name);
}
/* add a parameter to the list */
void Parameter::AddParameter(const char* name, Dim dims, TENSOR_DATA_TYPE dataType, int devID)
{
CheckNTErrors(GetParameter(name) == NULL, "the name must be unique");
IntList dim;
for (int i : dims) {
dim.Add(i);
}
XTensor* p = NewTensorV2(dims.size(), dim.items, dataType, devID);
strcpy(p->name, name);
list.Add(p);
}
/* get a parameter by its name */
XTensor* Parameter::GetParameter(const char* name)
{
for (int i = 0; i < list.Size(); i++) {
if (strcmp(list[i]->name, name) == 0)
return list[i];
}
/* if miss, return a null pointer */
return NULL;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* the model class
*
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-09-12
*
*/
#ifndef __MODEL_H__
#define __MODEL_H__
#include <utility>
#include "../tensor/XGlobal.h"
#include "../tensor/XTensor.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
using Dim = std::initializer_list<int>;
/* Parameter is a base class for parameters */
struct Parameter {
public:
/* the parameter list */
TensorList list;
public:
/* add a parameter to the list */
void AddParameter(const char* name, Dim dims, TENSOR_DATA_TYPE dataType, int devID);
/* get a parameter by its name */
XTensor* GetParameter(const char* name);
};
/* Model is a base class for neural networks */
struct Model {
public:
Parameter parameters;
public:
/* load a model from a binary file */
void Load(const char* fn);
/* dump the model to a binary file */
void Dump(const char* fn);
/* get a parameter by its name */
XTensor* Get(const char* name);
/* get a parameter by its name */
XTensor* operator[] (const char* name);
/* register a parameter with a unique name */
void Register(const char* name, Dim dims, TENSOR_DATA_TYPE dataType, int devID);
};
}
#endif // __MODEL_H__
\ No newline at end of file
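A minimal usage sketch of the registry above (the model name, shapes and devID value are illustrative only, not part of the commit):

#include "Model.h"
using namespace nts;

struct TinyModel : public Model {
    TinyModel(int devID) {
        /* each parameter must have a unique name; AddParameter enforces this */
        Register("embedding", { 1000, 512 }, X_FLOAT, devID);
        Register("proj", { 512, 1000 }, X_FLOAT, devID);
    }
};

void Demo() {
    TinyModel m(-1);                /* -1 runs on the CPU */
    XTensor* e = m["embedding"];    /* lookup via operator[] (or Get) */
    e->SetZeroAll();
    m.Dump("tiny.model");           /* count + offsets + raw data */
    m.Load("tiny.model");           /* reads the parameters back in list order */
}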
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
#include <stdio.h>
#include "XNet.h"
#include "../tensor/XUtility.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../sample/transformer/Transformer.h"
#include <fstream>
#include <string>
using namespace nts;
using namespace transformer;
void test() {
XTensor posEmbeddingBase;
int length = 5;
int eSize = 4;
int d = 4;
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT);
float* data = new float[posEmbeddingBase.unitNum];
for (int pos = 0; pos < length; pos++) {
float* dp = data + pos * eSize;
//int channelSize = eSize / 2;
//int offset = 0;
//for(int i = 0; i < channelSize; i++){
// dp[offset++] = (float)sin(pos/pow(10000.0F, 2.0F*i/(d - 2)));
//}
//for(int i = 0; i < channelSize; i++){
// dp[offset++] = (float)cos(pos/pow(10000.0F, 2.0F*i/(d - 2)));
//}
for (int k = 0; k < eSize; k++) {
if (k % 2 == 0) {
int i = k / 2;
dp[k] = (float)sin(pos / pow(10000.0F, 2.0F * i / d));
}
else {
int i = (k - 1) / 2;
dp[k] = (float)cos(pos / pow(10000.0F, 2.0F * i / d));
}
}
}
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
posEmbeddingBase.Dump(stderr);
delete[] data;
}
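/* for reference, the loop above fills the standard interleaved sinusoidal
   layout: PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d)).
   A quick hand check with d = eSize = 4 and pos = 1 (values rounded):
       dp = { sin(1), cos(1), sin(0.01), cos(0.01) }
          ~ { 0.8415, 0.5403, 0.0100,    1.0000 } */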
struct A {
XTensor a;
void update(XTensor b) {
a = b;
}
};
void test2(A *a) {
XTensor x;
InitTensor2D(&x, 2, 3);
XTensor y;
InitTensor2D(&y, 3, 2);
float data[]{ 1,1,1,1,1,1 };
x.SetData(data, 6);
y.SetData(data, 6);
XTensor z;
z = MatrixMul(x, y);
a->update(z);
}
void TestMemory() {
int devID = 0;
int memSize = 1024;
XMem *mem = new XMem(devID, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
XTensor a;
InitTensor2D(&a, 5, 5, X_FLOAT, 0, mem);
float d[25]{ 0 };
for (int i = 0; i < 25; i++)
d[i] = float(i);
a.SetData(d, 25);
int index[]{ 0,1,2,3,4 };
for (int i = 0; i < 4; i++) {
XTensor srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 4 - i, X_INT, a.devID, a.mem);
InitTensor1D(&tgtIdx, 4 - i, X_INT, a.devID, a.mem);
srcIdx.SetData(index, srcIdx.unitNum);
tgtIdx.SetAscendingOrder(0);
a = CopyIndexed(a, 0, srcIdx, tgtIdx);
printf("\nround %d\n", i);
a.Dump(stderr);
}
delete mem;
}
int main(int argc, const char** argv)
{
TransformerMain(argc - 1, argv + 1);
return 0;
}
......@@ -173,6 +173,10 @@ private:
static
void GradReduceSum(XTensor * node, bool isEfficient);
/* gradient for reduceSumAll */
static
void GradReduceSumAll(XTensor * node, bool isEfficient);
/* gradient for reduceSumSquared */
static
void GradReduceSumSquared(XTensor * node, bool isEfficient);
......
......@@ -281,7 +281,7 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
smallsGrad.Add(tail->grad);
if(i > 1){
CheckNTErrors(XTensor::IsSameShaped(last, tail),
CheckNTErrors(_IsSameShaped(last, tail),
"Input tensors must be of the same size!");
}
......@@ -391,7 +391,7 @@ void XShapeGrad::GradSplit(XTensor * node, bool isEfficient)
/* if the tensor is used somewhere else, we need another SUM
for gradient accumulation */
else{
XTensor * inputGradTMP = NewTensorBuf(input, input->devID, input->mem);
XTensor * inputGradTMP = NewTensorBufV2(input, input->devID, input->mem);
_Merge(node->grad, inputGradTMP, whereToSplit + 1, 0);
_Sum(input->grad, inputGradTMP, input->grad);
......@@ -475,7 +475,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
somewhere else, we need another SUM for gradient
accumulation */
else{
XTensor * nodeGradTMP = NewTensorBuf(node, node->devID, node->mem);
XTensor * nodeGradTMP = NewTensorBufV2(node, node->devID, node->mem);
_Merge(&splits, nodeGradTMP, whereToSplit + 1);
_Sum(node->grad, nodeGradTMP, node->grad);
......@@ -501,7 +501,7 @@ void XShapeGrad::GradTranspose(XTensor * node, bool isEfficient)
XTensor * output = node;
XTensor * input = income.tails[0];
XTensor * b = NewTensorBuf(input, input->devID, input->mem);
XTensor * b = NewTensorBufV2(input, input->devID, input->mem);
XNoder::MakeGrad(input);
int i = income.GetParamInt(0);
......@@ -543,7 +543,7 @@ void XShapeGrad::GradUnsqueeze(XTensor * node, bool isEfficient)
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
XTensor * g = NewTensorBuf(input->grad, input->devID, input->mem);
XTensor * g = NewTensorBufV2(input->grad, input->devID, input->mem);
_ReduceSum(output->grad, g, dim);
_Sum(input->grad, g, input->grad);
......
......@@ -77,104 +77,20 @@ backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
void XNet::Backward(XTensor &root)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(NULL);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(&gold);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> padding - specify a target value that is ignored and does not contribute to the gradient computation
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
TensorList roots(1);
roots.Add(&root);
TensorList golds(1);
golds.Add(&gold);
TensorList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
{
TensorList golds(roots.count);
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
}
Backward(roots, golds, paddings, loss);
}
/*
backward propagation to obtain gradient
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
{
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++)
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
Backward(roots);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> golds - a list of gold standard for the output
>> paddings - specify a target value that is ignored
>> loss - name of loss function
*/
void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots)
{
Traverse(roots);
......@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings,
node->visitMark = NODE_UNFINISHED;
}
//XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
/*for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;*/
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
/*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}*/
/* we compute dE/dy (y is the output) if no predefined activation function is used */
/*else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}*/
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);
......@@ -460,7 +343,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
}
}
/*
search for a node in a top-down manner by its name
>> top - the top most node
......
......@@ -61,25 +61,11 @@ struct XNet
void Clear();
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(XTensor &root);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......
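With the loss-specific overloads gone, callers are expected to build the loss into the graph itself and backpropagate from that node; a hedged sketch of the new convention (CrossEntropy here stands in for whatever loss node the network actually builds):

XTensor output;                    /* network output, built from autograd ops */
XTensor gold;                      /* gold-standard answers */
XTensor loss;
loss = CrossEntropy(output, gold); /* the loss is just another graph node */

XNet net;
net.Backward(loss);                /* gradients flow back from the loss node */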
......@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
if(node == NULL)
return;
if(!XTensor::IsSameShaped(node, node->grad)){
if(!_IsSameShaped(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
node->grad->SetZeroAll();
......
......@@ -20,7 +20,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#include "../tensor/core/CHeader.h"
#ifndef __XNODER_H__
#define __XNODER_H__
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is a simple implementation of the feed-forward network-based language
* model (FNNLM). See more details about FNNLM in
* "A Neural Probabilistic Language Model" by Bengio et al.
* Journal of Machine Learning Research 3 (2003) 1137-1155
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
* Today I was awarded the title of most popular teacher in our college.
* It was a great honour for me!
*/
#ifndef __FNNLM_H__
#define __FNNLM_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace fnnlm
{
#define _EXIT_(x) // exit(x)
#define CheckErrors(x, msg) { if(!(x)) { fprintf(stderr, "Error! calling '%s' (%s line %d): %s\n", #x, __FILENAME__, __LINE__, msg); _EXIT_(1); } }
#define ShowErrors(msg) { { fprintf(stderr, "Error! (%s line %d): %s\n", __FILENAME__, __LINE__, msg); _EXIT_(1); } }
#define MAX_N_GRAM 8
#define MAX_HIDDEN_NUM 8
/* an n-gram = a sequence of n words
words[0..n-2] is the history, and
words[n-1] is the word for prediction. */
struct NGram
{
int words[MAX_N_GRAM];
};
/* fnn model */
struct FNNModel
{
/* word embedding */
XTensor embeddingW;
/* parameter matrix of each hidden layer
hidden layer: y = f(x * w + b)
where x is the input, y is the output, w is
the transformation (parameter) matrix, b is
the bias and f() is the activation function. */
XTensor hiddenW[MAX_HIDDEN_NUM];
/* bias of each hidden layer */
XTensor hiddenB[MAX_HIDDEN_NUM];
/* parameter matrix of the output layer */
XTensor outputW;
/* bias of the output layer */
XTensor outputB;
/* order of the language model */
int n;
/* embedding size */
int eSize;
/* number of hidden layers */
int hDepth;
/* hidden layer size */
int hSize;
/* vocabulary size */
int vSize;
/* id of the device for running the model */
int devID;
/* indicates whether we use memory pool */
bool useMemPool;
/* memory pool */
XMem * mem;
FNNModel(){ n = -1; vSize = -1; hDepth = 0; devID = -1; mem = NULL; };
~FNNModel(){delete mem;};
};
/* the network built on the fly */
struct FNNNet
{
/* embedding result of the previous n - 1 words */
XTensor embeddings[MAX_N_GRAM];
/* concatenation of embeddings */
XTensor embeddingCat;
/* output of the hidden layers */
XTensor hiddens[MAX_HIDDEN_NUM];
/* state of the hidden layers (before activation function) */
XTensor hiddenStates[MAX_HIDDEN_NUM];
/* state before softmax */
XTensor stateLast;
/* output of the net */
XTensor output;
};
/* entrance of the program */
int FNNLMMain(int argc, const char ** argv);
};
#endif
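As a concrete shape walk-through of the structs above (illustrative sizes, not defaults from this file): with n = 3, eSize = 64, hSize = 128 and vSize = 10000, the two history words are looked up in embeddingW to give two 64-dim vectors, concatenated into a 128-dim input x, transformed by each hidden layer as y = f(x * w + b), and finally mapped by outputW and outputB to a 10000-way softmax over the next word.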
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
......@@ -41,12 +41,12 @@ public:
public:
bool IsEmpty(){
bool IsEmpty() {
return (k == NULL) && (v == NULL);
}
void Clear() {
if (k && v && k->id > 0 && v->id >0) {
if (k && v && k->id > 0 && v->id > 0) {
DelTensor(k);
DelTensor(v);
}
......@@ -84,32 +84,29 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* head number */
int nhead;
/* transformation matrix for query */
/* transformation matrix for Q */
XTensor wq;
/* bias for query */
/* bias for Q */
XTensor bq;
/* transformation matrix for query */
/* transformation matrix for K */
XTensor wk;
/* bias for query */
/* bias for K */
XTensor bk;
/* transformation matrix for query */
/* transformation matrix for V */
XTensor wv;
/* bias for query */
/* bias for V */
XTensor bv;
/* max relative window size */
XTensor rpEmbK;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wa;
......@@ -140,7 +137,7 @@ public:
DTYPE dropoutP;
/* max relative window size */
int maxRP;
int max_relative_position;
public:
......@@ -151,23 +148,23 @@ public:
~T2TAttention();
/* initialize the model */
void InitModel(int argc, char ** argv,
void InitModel(int argc, char** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor *mask,
XTensor Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
bool isTraining, Cache* cache, int cacheType);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor *k, XTensor *q, XTensor *v, const XTensor *mask, bool isTraining, bool isEnc);
XTensor MakeAttention(XTensor* k, XTensor* q, XTensor* v, const XTensor* mask, bool isTraining, bool is_encoder);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeRPRAttention(XTensor *k, XTensor *q, XTensor *v, XTensor *mask, bool isTraining, bool isEnc);
XTensor MakeRPRAttention(XTensor* k, XTensor* q, XTensor* v, XTensor* mask, bool isTraining, bool is_encoder);
void GetRPEmbedding(XTensor* embMatrix, const int lenQ, const int lenKV, const int maxRelativeLen, const int device_id, const bool isEnc);
void GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int device_id, const bool is_encoder);
void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool isKey);
void RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* attention, const bool is_key);
};
}
......
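For orientation, the renamed members implement the usual projections; in standard notation (not restated in the commit itself), Q = x * wq + bq, K = m * wk + bk and V = m * wv + bv, and MakeAttention computes softmax(Q * K^T / sqrt(d_k)) * V per head, with rp_embedding_k supplying the relative-position keys in the RPR variant.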
......@@ -61,29 +61,27 @@ initialize the model
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
{
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 3);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
......@@ -96,11 +94,11 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
}
decodeLayerNorm->InitModel(argc, argv, myDevID);
}
......@@ -114,7 +112,7 @@ make the decoding network
>> isTraining - indicates whether the model is used for training
<< return - the output tensor of the encoder
*/
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, const XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining)
{
XTensor x;
......
......@@ -22,7 +22,6 @@
#ifndef __T2TDECODER_H__
#define __T2TDECODER_H__
#include <array>
#include "T2TEncoder.h"
namespace transformer
......@@ -38,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -103,10 +99,10 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, const XTensor *mask, XTensor &maskEncDec, bool isTraining);
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining);
};
}
......
......@@ -31,7 +31,6 @@ namespace transformer
T2TEmbedder::T2TEmbedder()
{
devID = -1;
mem = NULL;
vSize = -1;
maxLength = -1;
}
......@@ -46,12 +45,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
mem = myMem;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
......@@ -65,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, b
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
......@@ -83,7 +80,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -101,14 +98,13 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
}
/* zero pad */
/* padding zeros */
int padStart = padIdx * eSize;
for (int i = padStart; i < padStart + eSize; i++)
for (int i = padStart; i < padStart + eSize; ++i)
data[i] = 0.F;
posEmbeddingBase.SetData(data, posEmbeddingBase.unitNum);
delete[] data;
}
......
......@@ -41,9 +41,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -74,7 +71,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length, int padIdx);
......
......@@ -53,20 +53,18 @@ initialize the model
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
{
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 35);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
......@@ -82,12 +80,11 @@ void AttEncoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
}
encodeLayerNorm->InitModel(argc, argv, myDevID, myMem);
encodeLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -104,6 +101,10 @@ XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, boo
x = embedder.Make(input, 0);
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
for(int i = 0; i < nlayer; i++){
XTensor att;
XTensor ln;
......
......@@ -65,9 +65,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -118,7 +115,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, bool isTraining);
......
......@@ -47,12 +47,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -68,7 +66,7 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
InitTensor2DV2(&w2, outSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID, myMem);
fnnLayerNorm.InitModel(argc, argv, myDevID);
//float scale = 1.0F;
//float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
......
......@@ -37,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* size of input vector */
int inSize;
......@@ -76,7 +73,7 @@ public:
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
......
......@@ -32,7 +32,6 @@ namespace transformer
T2TLN::T2TLN()
{
devID = -1;
mem = NULL;
d = 0;
}
......@@ -46,21 +45,19 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
//w.SetDataRand(1.0F, 1.0F);
//b.SetZeroAll();
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
}
/*
......
......@@ -37,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
......@@ -57,7 +54,7 @@ public:
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -35,7 +35,9 @@ XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
XTensor base;
XTensor lp;
base = (length + 5)/(1.0F + 5.0F);
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
......
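As a worked example of the GNMT penalty above: with length = 10 and alpha = 0.6, base = (10 + 5) / 6 = 2.5 and lp = 2.5^0.6 ~ 1.73, so a 10-token hypothesis is typically normalized by about 1.73 rather than by its raw length.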
......@@ -40,9 +40,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the encoder */
AttEncoder * encoder;
......@@ -71,9 +68,6 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv);
/* reset cache for decoder */
void ResetCache();
/* make the encoding network */
XTensor MakeEncoder(XTensor &input, XTensor *mask, bool isTraining);
......@@ -110,9 +104,6 @@ public:
void Read(const char * fn);
};
void FastRead(XTensor* x, FILE* f);
void FastDump(XTensor* x, FILE* f);
void ConvertModelFile(const TensorList* params, const char* src, const char* tgt);
}
#endif
......@@ -25,14 +25,12 @@
#include "T2TEmbedding.h"
#include "../../tensor/core/CHeader.h"
namespace transformer
{
/* constructor */
T2TOutput::T2TOutput()
{
devID = -1;
mem = NULL;
vSize = -1;
inSize = -1;
hSize = -1;
......@@ -48,12 +46,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -62,14 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
//float scale = 1.0F;
//float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
//w.SetDataRand(-finfout, finfout);
//DTYPE v = 1.0F/(float)sqrt((float)hSize);
//w.SetDataRandn(0, v);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
}
/*
......@@ -83,7 +72,6 @@ XTensor T2TOutput::Make(XTensor &input)
XTensor &x = input;
return Softmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
//return MulAndShift(x, X_NOTRANS, w, X_TRANS, b);
}
/*
......
......@@ -38,9 +38,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -61,7 +58,7 @@ public:
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......
......@@ -146,7 +146,7 @@ public:
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state, XTensor * encoding);
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
......@@ -155,12 +155,13 @@ public:
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor & encoding,
XTensor & inputEnc, XTensor & paddingEnc,
XTensor& nonFinished, bool updateFinished);
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
};
}
......
......@@ -62,12 +62,6 @@ private:
/* start symbol */
int startSymbol;
/* scalar of the input sequence (for max number of search steps) */
float scalarMaxLength;
/* indicate whether the early stop strategy is used */
bool isEarlyStop;
public:
/* constructor */
T2TSearch();
......@@ -79,8 +73,7 @@ public:
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score);
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
......@@ -101,7 +94,7 @@ public:
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output, XTensor * score);
void Dump(XTensor * output);
/* check if the token is an end symbol */
bool IsEnd(int token);
......@@ -109,17 +102,6 @@ public:
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* penalize beams that completed */
int UpdateCompleted(T2TStateBundle * beam, XTensor & encoding,
XTensor& inputEnc, XTensor& paddingEnc,
IntList completedStates, XTensor &nonFinished);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle * beam);
/* check if any hypotheses are completed */
IntList IsAnyCompleted(T2TStateBundle * beam);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
......
......@@ -15,17 +15,18 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "T2TUtility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
#include "..//..//tensor/XTensor.h"
using namespace nts;
......@@ -35,7 +36,6 @@ namespace transformer
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
......@@ -44,39 +44,23 @@ T2TTester::~T2TTester()
}
/* initialize the model */
void T2TTester::Init(int argc, char** argv)
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, false);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
Result ExtractRes(XTensor& output, IntList& indices, int i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensor(&tgtIdx, &srcIdx);
tgtIdx.SetAscendingOrder(0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
res.data.Add((int*)sent.data, sent.unitNum);
res.id = indices[i];
return res;
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int wordCount = 0;
......@@ -85,11 +69,10 @@ void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
int batchCount = 0;
/* data files */
FILE* ofile = fopen(ofn, "w");
FILE* ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem* mem = model->mem;
double startT = GetClockSec();
......@@ -102,23 +85,44 @@ void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
/* an array that keeps the sequences */
int* seqs = new int[MILLION];
batchLoader.Init(fn, 100, true);
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty()) {
while (!batchLoader.IsEmpty())
{
count++;
wordCount = 0;
/*if (count % 10 == 0 && sentBatch < 128)
sentBatch *= 2;*/
/* reset cache for decoder */
model->ResetCache();
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfCache[i].Clear();
model->decoder->contextCache[i].Clear();
}
IntList indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output, score;
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
seacher.Search(model, &batchEnc, &paddingEnc, &output);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 1, X_INT, output.devID);
int idx[]{i};
srcIdx.SetData(idx, 1);
InitTensor(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
res.values = sent;
res.id = indices[i];
batchLoader.resBuffer.emplace_back(res);
}
for (int i = 0; i < indices.Size(); i++)
batchLoader.resBuffer.Add(ExtractRes(output, indices, i));
wc = batchEnc.GetDim(-1);
wordCount += wc;
......@@ -126,41 +130,48 @@ void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] elapsed=%.1fs, sent=%d, sword=%d\n", elapsed, sentCount, wordCount);
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
batchLoader.SortRes();
batchLoader.RerankRes();
for (int i = 0; i < batchLoader.resBuffer.Size(); i++)
Dump(ofile, batchLoader.resBuffer[i].data);
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
}
/*
dump the result into the file
>> file - data file
>> output - output list
>> output - output tensor
*/
void T2TTester::Dump(FILE* file, IntList& output)
void T2TTester::Dump(FILE * file, XTensor * output)
{
for (int i = 0; i < output.Size(); i++) {
int w = output[i];
if (w < 0) {
if (i == 0)
return;
else
break;
}
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
}
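For example (illustrative values), a row { 5, 6, 2, -1 } in the output tensor is written as "5 6 2 -1 " followed by a newline: the negative end-of-sequence marker is printed before the inner loop breaks, and the remaining positions of that row are skipped.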
......@@ -62,7 +62,7 @@ public:
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, IntList& output);
void Dump(FILE * file, XTensor * output);
};
}
......
......@@ -22,7 +22,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "T2TUtility.h"
namespace transformer
{
......@@ -115,11 +114,4 @@ void ShowParams(int argc, char ** argv)
fprintf(stderr, "\n");
}
/* dump tensors */
void DumpTensors(std::initializer_list<nts::XTensor*> list) {
int i(0);
for (auto& x : list)
x->Dump(stderr, std::to_string(++i).c_str());
}
}
......@@ -23,9 +23,6 @@
#define __T2TUTILITY_H__
#include <stdio.h>
#include <string>
#include "..//..//tensor/XTensor.h"
#include <initializer_list>
namespace transformer
{
......@@ -41,10 +38,6 @@ void LoadParamFloat(int argc, char ** argv, const char * name, float * p, float
/* show arguments */
void ShowParams(int argc, char ** argv);
/* dump tensors */
void DumpTensors(std::initializer_list<nts::XTensor*> list);
extern int llnum;
extern FILE * tf;
......
......@@ -29,35 +29,10 @@
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
#include "..//..//model/Model.h"
namespace transformer
{
struct AttModel : Model {
AttModel(int devID) {
Register("w1", {2,3,4}, X_FLOAT, devID);
Register("b1", {2,3,4}, X_FLOAT, devID);
Register("3", {2,3,4}, X_FLOAT, devID);
}
};
struct Transformer {
AttModel *att;
Transformer(int devID) {
att = new AttModel(devID);
}
~Transformer() {
delete att;
}
};
void test() {
Transformer model(0);
model.att->Get("w1")->SetZeroAll();
model.att->Get("w1")->Dump(stderr);
}
int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
......@@ -71,43 +46,24 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, args);
bool convertFile = false;
bool isBeamSearch = false;
bool convertModel = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * rawFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
char * outputFN = new char[MAX_LINE_LENGTH];
char * rawModel = new char[MAX_LINE_LENGTH];
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "rawmodel", rawModel, "");
LoadParamString(argc, args, "input", testFN, "");
LoadParamString(argc, args, "rawinput", rawFN, "");
LoadParamString(argc, args, "rawModel", rawModel, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
LoadParamBool(argc, args, "convertfile", &convertFile, false);
LoadParamBool(argc, args, "convertmodel", &convertModel, false);
srand((unsigned int)time(NULL));
T2TModel model;
model.InitModel(argc, args);
/* convert test file from text to binary */
if (convertFile) {
DataSet::ConvertFile(rawFN, testFN);
}
/* convert parameters from text to binary */
if (convertModel) {
TensorList params(100);
model.GetParams(params);
ConvertModelFile(&params, rawModel, modelFN);
}
/* load the model if neccessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
......@@ -119,6 +75,7 @@ int TransformerMain(int argc, const char ** argv)
searcher.Test(testFN, outputFN, &model);
}
delete[] trainFN;
delete[] modelFN;
delete[] testFN;
delete[] outputFN;
......
......@@ -19,23 +19,27 @@
* $Created by: HU Chi (huchinlp@foxmail.com) 2019-04-05
*/
#include "DataSet.h"
#include "StringUtil.h"
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include "DataSet.h"
#include "StringUtil.h"
#include "../../../tensor/XUtility.h"
#include "..//..//..//tensor/XUtility.h"
using namespace nts;
using namespace std;
/* sort results by their ids */
void DataSet::SortRes()
{
auto cmp = [](Result& a, Result& b) {
bool Compare(Example& a, Example& b) {
return a.values.size() > b.values.size();
}
bool CompareRes(Result& a, Result& b) {
return a.id < b.id;
};
std::sort(resBuffer.items, resBuffer.items + resBuffer.count, cmp);
}
void DataSet::RerankRes(){
sort(resBuffer.begin(), resBuffer.end(), CompareRes);
}
/*
......@@ -43,30 +47,27 @@ load data from the file to the buffer
*/
void DataSet::LoadDataToBuffer()
{
string line;
buffer.clear();
bufferUsed = 0;
srcBuffer.Clear();
bufferSize = min(bufferSize, exampleNumber);
const string tokenDelimiter = " ";
for (int i = 0; i < bufferSize; i++) {
long off = offset[index++];
IntList data(off);
data.count = off;
fread(data.items, sizeof(int), off, fp);
int id = 0;
while (getline(*fp, line)) {
vector<int> values = Split<int>(line, tokenDelimiter);
Example example;
example.id = id++;
example.data = data;
srcBuffer.Add(example);
example.values = values;
buffer.emplace_back(example);
}
if (fp->eof()) {
fp->seekg(fp->beg);
}
if (sortBuffer) {
auto cmp = [](Example& a, Example& b) {
return a.data.Size() > b.data.Size();
};
std::sort(srcBuffer.items, srcBuffer.items + srcBuffer.count, cmp);
sort(buffer.begin(), buffer.end(), Compare);
}
resBuffer.reserve(buffer.size());
}
/*
......@@ -77,52 +78,43 @@ select a field and generate a mini-batch by indices
>>> devID - device id, -1 for CPU
*/
IntList DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, size_t batchSize, int devID)
vector<int> DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID)
{
if(srcBuffer.count == 0)
LoadDataToBuffer();
size_t realBatchSize = batchSize;
/* real batch size */
if ((srcBuffer.Size() - bufferUsed) < batchSize) {
realBatchSize = srcBuffer.Size() - bufferUsed;
if ((buffer.size()-bufferUsed) < batchSize) {
realBatchSize = buffer.size()-bufferUsed;
}
/* get the maximum sentence length in a mini-batch */
size_t maxLen = 0;
if (realBatchSize == 1) {
maxLen = srcBuffer[bufferUsed].data.Size();
}
for (size_t i = 0; i < realBatchSize - 1; i++) {
maxLen = max(maxLen, srcBuffer[bufferUsed + i].data.Size());
if (realBatchSize == 1)
maxLen = buffer[bufferUsed].values.size();
for (size_t i = 0; i < realBatchSize - 1; ++i) {
maxLen = max(maxLen, buffer[bufferUsed+i].values.size());
}
CheckNTErrors(maxLen != 0, "wrong length detected");
int* batchValues = new int[maxLen * realBatchSize];
float* paddingValues = new float[maxLen * realBatchSize];
int* batchValues = new int[realBatchSize * maxLen];
float* paddingValues = new float[realBatchSize * maxLen];
for (int i = 0; i < realBatchSize * maxLen; i++) {
for (int i = 0; i < realBatchSize * maxLen; ++i) {
batchValues[i] = 1.0F;
}
memset(batchValues, 0, sizeof(int) * maxLen * realBatchSize);
memset(paddingValues, 0, sizeof(float) * maxLen * realBatchSize);
size_t cur = 0;
/* left padding */
IntList indices;
indices.Reserve(realBatchSize);
for (size_t i = 0; i < realBatchSize; i++) {
indices.Add(srcBuffer[bufferUsed + i].id);
IntList& data = srcBuffer[bufferUsed + i].data;
cur = maxLen * (i + 1) - data.Size();
for (int j = 0; j < data.Size(); j++) {
batchValues[cur] = data[j];
vector<int> indices;
indices.reserve(realBatchSize);
for (size_t i = 0; i < realBatchSize; ++i) {
indices.push_back(buffer[bufferUsed + i].id);
cur = maxLen * (i + 1) - buffer[bufferUsed+i].values.size();
for (int v : buffer[bufferUsed + i].values) {
batchValues[cur] = v;
paddingValues[cur++] = 1.0F;
}
cur = maxLen * (i + 1);
......@@ -135,6 +127,7 @@ IntList DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, size_t batc
batchEnc->SetData(batchValues, batchEnc->unitNum);
paddingEnc->SetData(paddingValues, paddingEnc->unitNum);
delete[] batchValues;
delete[] paddingValues;
......@@ -142,99 +135,17 @@ IntList DataSet::LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, size_t batc
}
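To make the left-padding in LoadBatch concrete (a worked example, not output from the commit): with batchSize = 2, sentences { 7 8 9 } and { 1 2 3 4 5 } give maxLen = 5, and the flattened buffers become

/* batchValues   = { 0 0 7 8 9,   1 2 3 4 5 }
 * paddingValues = { 0 0 1 1 1,   1 1 1 1 1 } */

so each sentence ends at a row boundary and the padding mask marks exactly the real tokens.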
/*
convert text file to binary file
format of the text file:
one sentence per line, separated by blanks
format of the binary file:
part 1: number of all examples
part 2: offsets of all examples
part 3: the raw data
>>> src - the path of source text file
>>> tgt - the path of target binary file
*/
void nts::DataSet::ConvertFile(const char* src, const char* tgt)
{
ifstream ifile(src, ios::in);
FILE* ofile = fopen(tgt, "wb");
CheckNTErrors(ofile, "unable to create the output file");
string line;
size_t idx = 0;
const int maxExample = 10240;
IntList dataList[maxExample];
while (getline(ifile, line)){
SplitInt(line, " ", dataList[idx++]);
}
/* part 1: number of examples */
fwrite(&idx, sizeof(idx), 1, ofile);
/* part 2: offset of all examples */
for (int i = 0; i < idx; i++) {
size_t size = (dataList[i].Size());
fwrite(&size, sizeof(size), 1, ofile);
}
/* part 3: value of examples */
for (int i = 0; i < idx; i++) {
fwrite(dataList[i].items, sizeof(int), dataList[i].Size(), ofile);
}
ifile.close();
fclose(ofile);
}
/*
the constructor of DataSet
the binary data consists of three parts
part 1: number of all examples
part 2: offsets of all examples
part 3: the raw data
>>> fname - path of the data file
>>> myBufferSize - size of the data buffer
>>> mySortBuffer - whether sort the data
*/
void DataSet::Init(const char* fname, size_t myBufferSize, bool mySortBuffer)
void DataSet::Init(const char* fname)
{
id = 0;
index = 0;
fp = new ifstream(fname);
CheckNTErrors(fp->is_open(), "can not open the file");
bufferUsed = 0;
bufferSize = myBufferSize;
sortBuffer = mySortBuffer;
fp = fopen(fname, "rb");
CheckNTErrors(fp, "can not open the file");
/* read offsets */
exampleNumber = 0;
fread(&exampleNumber, sizeof(exampleNumber), 1, fp);
CheckNTErrors(exampleNumber > 0, "invalid example numbers");
offset.Reserve(exampleNumber);
for (int i = 0; i < exampleNumber; i++) {
size_t off;
fread(&off, sizeof(off), 1, fp);
offset.Add(off);
}
/* reset the buffer size if it is too big */
bufferSize = min(bufferSize, exampleNumber);
srcBuffer.Reserve(bufferSize);
}
/* check if the buffer is empty */
bool nts::DataSet::IsEmpty()
{
return (index >= offset.count) && (bufferUsed >= bufferSize);
LoadDataToBuffer();
if (bufferSize == 0)
bufferSize = buffer.size();
}
/* de-constructor */
nts::DataSet::~DataSet()
{
if (fp) {
fclose(fp);
}
}
\ No newline at end of file
......@@ -22,70 +22,79 @@
#ifndef __DATASET_H__
#define __DATASET_H__
#include <cstdio>
#include "../../..//tensor/XTensor.h"
#include "../../..//tensor/XGlobal.h"
namespace nts {
#include <cstdio>
#include <fstream>
#include <unordered_map>
#include <vector>
using namespace std;
using namespace nts;
/* `DataSet` maintains data buffers for the inference stage. */
struct DataSet {
struct Example {
int id;
vector<int> values;
};
public:
struct Result {
int id;
XTensor values;
};
/* the data buffer */
ExampleList srcBuffer;
using BufferType = vector<Example>;
using ResBufferType = vector<Result>;
/* the result buffer */
ResultList resBuffer;
namespace nts { // namespace nts(NiuTrans.Tensor)
/* the offset of all examples in the data */
LongList offset;
/* A `DataSet` is associated with a file which contains variable length data.*/
struct DataSet {
/* whether to sort the dataset */
bool sortBuffer;
/* the data buffer */
BufferType buffer;
/* id for each example */
size_t id;
/* the result buffer */
ResBufferType resBuffer;
/* the pointer to file stream */
ifstream* fp{nullptr};
/* size of the data buffer */
size_t bufferSize;
size_t bufferSize{ 0 };
/* size of used data in buffer */
size_t bufferUsed;
/* size of data in the src file */
size_t exampleNumber;
/* current index of the offset */
size_t index;
/* the pointer of the src file stream */
FILE * fp;
public:
size_t bufferUsed{ 0 };
/* check if the buffer is empty */
bool IsEmpty();
/* whether to sort the dataset */
bool sortBuffer{ true };
/* load data from a file to the buffer */
void LoadDataToBuffer();
/* initialization function */
void Init(const char* fname, size_t myBufferSize, bool mySortBuffer);
/* rerank result for output */
void RerankRes();
/* generate a mini-batch */
IntList LoadBatch(XTensor * batchEnc, XTensor * paddingEnc, size_t batchSize, int devID);
vector<int> LoadBatch(XTensor * batchEnc, XTensor * paddingEnc,
size_t batchSize, int devID);
/* sort results by their ids */
void SortRes();
/* initialization function */
void Init(const char* fname);
/* transform text file to binary file */
static void ConvertFile(const char* src, const char* tgt);
/* check if the buffer is empty */
bool IsEmpty() {
if (bufferUsed < bufferSize)
return false;
return true;
}
/* de-constructor */
~DataSet();
~DataSet() {
if (fp)
fp->close();
delete fp;
}
};
} // namespace nts(NiuTrans.Tensor)
......
......@@ -21,66 +21,27 @@
#include "StringUtil.h"
/*
split string by delimiter, this will return indices of all sub-strings
>>> s - the original string
>>> delimiter - as it is
>>> a - the indices of all sub-strings
*/
void SplitToPos(const string& s, const string& delimiter, LongList& indices)
namespace nts {
/* split string by delimiter, this will return indices of all sub-strings */
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter)
{
vector<pair<int, int>> fields;
if (delimiter.length() == 0) {
indices.Add(0);
fields.emplace_back(0, s.length());
return fields;
}
int pos = 0;
int start = 0;
while ((pos = s.find(delimiter, start)) != string::npos) {
if (pos != start) {
indices.Add(start);
fields.emplace_back(start, pos);
}
start = pos + delimiter.length();
}
if (start != s.length()) {
indices.Add(start);
}
}
IntList SplitInt(const string& s, const string& delimiter)
{
IntList fields;
LongList indices;
SplitToPos(s, delimiter, indices);
for (int i = 0; i < indices.Size(); i++) {
fields.Add(strtol(s.data() + indices[i], nullptr, 10));
}
return fields;
}
void SplitInt(const string& s, const string& delimiter, IntList& fields)
{
LongList indices;
SplitToPos(s, delimiter, indices);
for (int i = 0; i < indices.Size(); i++) {
fields.Add(strtol(s.data() + indices[i], nullptr, 10));
}
}
FloatList SplitFloat(const string& s, const string& delimiter)
{
FloatList fields;
LongList indices;
SplitToPos(s, delimiter, indices);
for (int i = 0; i < indices.Size(); i++) {
fields.Add(strtof(s.data() + indices[i], nullptr));
fields.emplace_back(start, s.length());
}
return fields;
}
void SplitInt(const string& s, const string& delimiter, FloatList& fields)
{
LongList indices;
SplitToPos(s, delimiter, indices);
for (int i = 0; i < indices.Size(); i++) {
fields.Add(strtof(s.data() + indices[i], nullptr));
}
}
\ No newline at end of file
......@@ -22,25 +22,85 @@
#ifndef __STRING_UTIL_H__
#define __STRING_UTIL_H__
#include <cstdlib>
#include <string>
#include "..//..//..//tensor/XList.h"
#include <utility>
#include <vector>
using namespace std;
using namespace nts;
namespace nts {
/* Splits a string based on the given delimiter string. Each pair in the
* returned vector has the start and past-the-end positions for each of the
* parts of the original string. Empty fields are not represented in the output.
*/
vector<pair<int, int>> SplitToPos(const string& s, const string& delimiter);
/* Splits the given string and converts each part to the given T. */
template <typename T>
vector<T> Split(const string& s, const string& delimiter);
template <>
inline vector<string> Split(const string& s, const string& delimiter)
{
vector<string> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(s.substr(p.first, p.second - p.first));
}
return fields;
}
template <>
inline vector<int> Split(const string& s, const string& delimiter)
{
vector<int> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<int64_t> Split(const string& s, const string& delimiter)
{
vector<int64_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtoll(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<float> Split(const string& s, const string& delimiter)
{
vector<float> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtof(s.data() + p.first, nullptr));
}
return fields;
}
template <>
inline vector<uint8_t> Split(const string& s, const string& delimiter)
{
vector<uint8_t> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(strtol(s.data() + p.first, nullptr, 10));
}
return fields;
}
template <>
inline vector<bool> Split(const string& s, const string& delimiter)
{
vector<bool> fields;
for (const auto& p : SplitToPos(s, delimiter)) {
fields.emplace_back(
static_cast<bool>(strtol(s.data() + p.first, nullptr, 10)));
}
return fields;
}
} // namespace nts
#endif // __STRING_UTIL_H__
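/* A minimal usage sketch (illustrative only): Split<T> cuts a string on a
   delimiter and converts each field; empty fields are skipped by SplitToPos. */
static void SplitDemo()
{
    using namespace nts;
    vector<int> ids = Split<int>("12\t7\t42", "\t");      /* {12, 7, 42} */
    vector<float> vals = Split<float>("0.5 1.5", " ");    /* {0.5f, 1.5f} */
    vector<string> toks = Split<string>("a,b,,c", ",");   /* {"a", "b", "c"} */
}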
......@@ -26,183 +26,9 @@
*
*/
#ifdef WIN32
#include <wtypes.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include "XBLAS.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifdef WIN32
HINSTANCE hBLASDll;
#endif
/* single-precision floating matrix-matrix multiplication */
void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
double *, OPENBLAS_CONST BLASINT);
/* single-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/* set the number of threads */
void (*XBLAS_SET_THREAD_NUM)(int);
/* get the number of threads */
//int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores).*/
int (*XBLAS_GET_CORE_NUM)();
/* get the CPU corename */
//char * (*XBLAS_GET_CORE_NAME)();
/* get the parallelization type used by OpenBLAS */
//int (*XBLAS_GET_PARALLEL_TYPE)(void);
#if defined(USE_BLAS)
/* load some stuff for BLAS */
void LoadBLAS(const char * dllFileName)
{
#ifndef CUDA_BLAS
#ifdef _WIN32
#if defined(OPENBLAS)
/* non-ascii characters are not supported yet */
wchar_t * fn = new wchar_t[strlen(dllFileName) + 1];
memset(fn, 0, sizeof(wchar_t) * (strlen(dllFileName) + 1));
for(int i = 0; i < strlen(dllFileName); i++)
fn[i] = dllFileName[i];
hBLASDll = LoadLibrary((LPCWSTR)fn);
if(!hBLASDll){
XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
exit(1);
}
/* matrix-matrix multiplication */
(FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
(FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
/* vector-vector multiplication */
(FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_set_num_threads");
//(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "goto_set_num_threads");
//(FARPROC&)XBLAS_GET_THREAD_NUM = GetProcAddress(hBLASDll, "openblas_get_num_threads");
(FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "openblas_get_num_procs");
//(FARPROC&)XBLAS_GET_CORE_NAME = GetProcAddress(hBLASDll, "openblas_get_corename");
//(FARPROC&)XBLAS_GET_PARALLEL_TYPE = GetProcAddress(hBLASDll, "openblas_get_parallel");
delete[] fn;
#endif // defined(OPENBLAS)
#if defined(MKL)
/* non-ascii characters are not supported yet */
wchar_t * fn = new wchar_t[strlen(dllFileName) + 1];
memset(fn, 0, sizeof(wchar_t) * (strlen(dllFileName) + 1));
for(int i = 0; i < strlen(dllFileName); i++)
fn[i] = dllFileName[i];
hBLASDll = LoadLibrary((LPCWSTR)fn);
if(!hBLASDll){
XPRINT1(0, stderr, "[LoadBLAS] Error! Cannot load dll %s!\n", dllFileName);
exit(1);
}
/* matrix-matrix multiplication */
(FARPROC&)XBLAS_SGEMM = GetProcAddress(hBLASDll, "cblas_sgemm");
(FARPROC&)XBLAS_DGEMM = GetProcAddress(hBLASDll, "cblas_dgemm");
/* vector-vector multiplication */
(FARPROC&)XBLAS_SGER = GetProcAddress(hBLASDll, "cblas_sger");
(FARPROC&)XBLAS_DGER = GetProcAddress(hBLASDll, "cblas_dger");
/* multi-threading */
(FARPROC&)XBLAS_SET_THREAD_NUM = GetProcAddress(hBLASDll, "MKL_Set_Num_Threads");
(FARPROC&)XBLAS_GET_CORE_NUM = GetProcAddress(hBLASDll, "MKL_Get_Max_Threads");
#endif // defined(MKL)
#else // _WIN32
XBLAS_SGEMM = &cblas_sgemm;
XBLAS_DGEMM = &cblas_dgemm;
XBLAS_SGER = &cblas_sger;
XBLAS_DGER = &cblas_dger;
#if defined(OPENBLAS)
XBLAS_SET_THREAD_NUM = &openblas_set_num_threads;
XBLAS_GET_CORE_NUM = &openblas_get_num_procs;
#endif // defined(OPENBLAS)
#if defined(MKL)
XBLAS_SET_THREAD_NUM = &mkl_set_num_threads;
XBLAS_GET_CORE_NUM = &mkl_get_max_num_threads;
#endif // defined(MKL)
#endif // _WIN32
XBLAS_SET_THREAD_NUM(1);
#endif // ndef(CUDA_BLAS)
}
/* unload the libs */
void UnloadBLAS()
{
#ifdef _WIN32
if(!FreeLibrary(hBLASDll)){
XPRINT(0, stderr, "[UnloadBLAS] Error! Cannot free the BLAS dll!\n");
exit(1);
}
#else
#endif
}
#else // !defined(USE_BLAS)
void LoadBLAS(const char * dllFileName)
{
XPRINT(0, stderr, "[LoadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
void UnloadBLAS()
{
XPRINT(0, stderr, "[UnloadBLAS] Error! No Blas lib is available. Please use OPENBLAS or MKL!\n");
exit(1);
}
#endif // defined(USE_BLAS)
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
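/* A minimal usage sketch (illustrative only; assumes a build with USE_BLAS
   and OPENBLAS; the library name is an assumption and is only consulted by
   the Windows code path): */
static void BLASDemo()
{
    nts::LoadBLAS("libopenblas.dll");
    /* ... run matrix operations through the XBLAS_* entries ... */
    nts::UnloadBLAS();
}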
......@@ -34,7 +34,6 @@ namespace nts{
/* some of the code below is from OpenBLAS (https://github.com/xianyi/OpenBLAS) */
//#define OPENBLAS
#define OPENBLAS_CONST const
typedef int BLASINT;
......@@ -46,7 +45,26 @@ typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
#if defined(USE_BLAS)
#ifdef OPENBLAS
#define XBLAS_SGEMM cblas_sgemm
#define XBLAS_DGEMM cblas_dgemm
#define XBLAS_SGER cblas_sger
#define XBLAS_DGER cblas_dger
#define XBLAS_SAXPY cblas_saxpy
#define XBLAS_DAXPY cblas_daxpy
#define XBLAS_SET_THREAD_NUM openblas_set_num_threads
#define XBLAS_GET_CORE_NUM openblas_get_num_procs
#endif
#ifdef MKL
#define XBLAS_SGEMM cblas_sgemm
#define XBLAS_DGEMM cblas_dgemm
#define XBLAS_SGER cblas_sger
#define XBLAS_DGER cblas_dger
#define XBLAS_SAXPY cblas_saxpy
#define XBLAS_DAXPY cblas_daxpy
#define XBLAS_SET_THREAD_NUM MKL_Set_Num_Threads
#define XBLAS_GET_CORE_NUM MKL_Get_Max_Threads
#endif
/*
single/double-precision floating matrix-matrix multiplication (level-3 BLAS)
- SGEMM (ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
......@@ -62,14 +80,14 @@ where A, B and C are matrices,
LDB(=N) specifies the size of the first dimension of B as declared in the calling (sub) program,
and LDC(=N) specifies the size of the first dimension of C as declared in the calling (sub) program.
*/
extern "C" void (*XBLAS_SGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
extern "C" void XBLAS_SGEMM(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating matrix-matrix multiplication */
extern "C" void (*XBLAS_DGEMM)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
extern "C" void XBLAS_DGEMM(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE, OPENBLAS_CONST enum CBLAS_TRANSPOSE,
OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double,
......@@ -88,24 +106,33 @@ where X and Y are vectors with m and n elements respectively,
E.g., if we are using CblasRowMajor, the leading dimension is the number of columns of A.
*/
extern "C" void (*XBLAS_SGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
extern "C" void XBLAS_SGER(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST float *, OPENBLAS_CONST BLASINT,
float *, OPENBLAS_CONST BLASINT);
/* double-precision floating vector-vector multiplication (rank-1) */
extern "C" void (*XBLAS_DGER)(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
extern "C" void XBLAS_DGER(OPENBLAS_CONST enum CBLAS_ORDER, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT, OPENBLAS_CONST double *, OPENBLAS_CONST BLASINT,
double *, OPENBLAS_CONST BLASINT);
/* single-precision floating scaled vector addition (AXPY): y = a * x + y */
extern "C" void XBLAS_SAXPY(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST float a, OPENBLAS_CONST float *x, OPENBLAS_CONST BLASINT incx, float *y, OPENBLAS_CONST BLASINT incy);
/* double-precision floating scaled vector addition (AXPY): y = a * x + y */
extern "C" void XBLAS_DAXPY(OPENBLAS_CONST BLASINT n, OPENBLAS_CONST double a, OPENBLAS_CONST double *x, OPENBLAS_CONST BLASINT incx, double *y, OPENBLAS_CONST BLASINT incy);
/* set the number of threads */
extern "C" void (*XBLAS_SET_THREAD_NUM)(int);
extern "C" void XBLAS_SET_THREAD_NUM(int);
/* get the number of threads */
//extern "C" int (*XBLAS_GET_THREAD_NUM)();
/* get the number of physical processors (cores).*/
extern "C" int (*XBLAS_GET_CORE_NUM)();
extern "C" int XBLAS_GET_CORE_NUM();
/* get the CPU corename */
//extern "C" char * (*XBLAS_GET_CORE_NAME)();
......@@ -113,58 +140,6 @@ extern "C" int (*XBLAS_GET_CORE_NUM)();
/* get the parallelization type used by OpenBLAS */
//extern "C" int (*XBLAS_GET_PARALLEL_TYPE)(void);
/* linux systems */
#ifndef _WIN32
/* cblas functions that are imported from the lib. See cblas.h in OpenBlas for more information */
extern "C" void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST float *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST BLASINT K, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *A, OPENBLAS_CONST BLASINT lda,
OPENBLAS_CONST double *B, OPENBLAS_CONST BLASINT ldb,
OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST BLASINT ldc);
extern "C" void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST float alpha,
OPENBLAS_CONST float *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST BLASINT incY,
float *A, OPENBLAS_CONST BLASINT lda);
extern "C" void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST BLASINT M, OPENBLAS_CONST BLASINT N, OPENBLAS_CONST double alpha,
OPENBLAS_CONST double *X, OPENBLAS_CONST BLASINT incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST BLASINT incY,
double *A, OPENBLAS_CONST BLASINT lda);
#if defined(OPENBLAS)
/* better control of multi-threading */
extern "C" void openblas_set_num_threads(int num_threads);
extern "C" void goto_set_num_threads(int num_threads);
//extern "C" int openblas_get_num_threads(void);
extern "C" int openblas_get_num_procs(void);
//extern "C" char* openblas_get_config(void);
//extern "C" char* openblas_get_corename(void);
//extern "C" int openblas_get_parallel(void);
#endif
#endif
#if defined(MKL)
/* better control of multi-threading */
//_Mkl_Api(void,MKL_Set_Num_Threads,(int nth))
//_Mkl_Api(int,MKL_Get_Max_Threads,(void))
extern "C" void MKL_Set_Num_Threads(int num_threads);
extern "C" int MKL_Get_Max_Threads();
#define mkl_set_num_threads MKL_Set_Num_Threads
#define mkl_get_max_num_threads MKL_Get_Max_Threads
//extern "C" void mkl_set_num_threads(int num_threads);
//extern "C" void omp_set_num_threads(int num_threads);
//extern "C" int mkl_get_max_num_threads();
#endif
#if defined(CUDA_BLAS)
......@@ -186,24 +161,8 @@ extern void BLASMatrixMULD(int deviceID, double * a, double * b, double * c, int
#endif
#endif
#ifdef _WIN32
#include "windows.h"
extern HINSTANCE hBLASDll;
#else
#endif
/* load some stuff for BLAS */
extern void LoadBLAS(const char * dllFileName);
/* unload the libs */
extern void UnloadBLAS();
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
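/* A minimal usage sketch (illustrative only; assumes the library was built
   with USE_BLAS and OPENBLAS or MKL): a row-major SGEMM computing
   C = A * B for A (2x3), B (3x2) and C (2x2). */
#ifdef USE_BLAS
static void GemmDemo()
{
    using namespace nts;
    float A[6] = { 1, 2, 3, 4, 5, 6 };
    float B[6] = { 1, 0, 0, 1, 1, 1 };
    float C[4] = { 0, 0, 0, 0 };
    /* M = 2, N = 2, K = 3; the leading dimensions are the row widths in row-major order */
    XBLAS_SGEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 3, 1.0F, A, 3, B, 2, 0.0F, C, 2);
}
#endif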
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2019-10-21
*/
#ifndef __XCALL_H__
#define __XCALL_H__
#include "XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
* we define the "new and delete" functions below
*/
/* global flag for enabling gradient flows or not */
static bool X_ENABLE_GRAD = false;
/* initialize a XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor */
void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a scalar V2 */
void InitTensor0DV2(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a scalar */
void InitTensor0D(XTensor * tensor, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* initialize a tensor with a reference tensor V2 */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate a XTensor with no initialization */
XTensor * NewTensor();
/* generate a XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a scalar V2 */
XTensor * NewTensor0DV2(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* generate a scalar */
XTensor * NewTensor0D(const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a dense vector by range */
XTensor * NewTensorRange(int lower, int upper, int step, const TENSOR_DATA_TYPE myDataType = X_INT, const int myDevID = -1, const bool isEnableGrad = X_ENABLE_GRAD);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
/* free the data space of a given tensor */
void DelTensor(XTensor * tensor);
/* free the data space of a given tensor (on the buffer) */
void DelTensorBuf(XTensor * tensor);
} // namespace nts(NiuTrans.Tensor)
#endif // __XCALL_H__
\ No newline at end of file
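/* A minimal usage sketch (illustrative only): create, use and free dense tensors. */
static void TensorDemo()
{
    using namespace nts;
    XTensor * m = NewTensor2D(2, 3);              /* a 2x3 float matrix on the CPU */
    XTensor * v = NewTensor1D(8, X_FLOAT, -1);    /* a float vector of size 8 */
    /* ... */
    DelTensor(v);
    DelTensor(m);
}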
......@@ -50,14 +50,6 @@ int CONST_MINUSONE = -1;
bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -135,8 +135,6 @@ extern bool CONST_TRUE;
#define NIUTRANSNNDEBUG
extern int verboseLevel;
extern bool useBLAS;
extern bool useCUDA;
#define FFLUSH(FILEH) \
{ \
......
......@@ -249,26 +249,6 @@ inline int TensorListBase<T>::FindFirst(const T& item)
return -1;
}
template <>
inline int TensorListBase<Example>::FindFirst(const Example& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
template <>
inline int TensorListBase<Result>::FindFirst(const Result& item)
{
for (int i = 0; i < count; i++) {
if (item.id == items[i].id)
return i;
}
return -1;
}
/* clear the data array */
template <typename T>
void TensorListBase<T>::Clear()
......@@ -383,8 +363,6 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<Result>;
template struct TensorListBase<Example>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -133,18 +133,6 @@ typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
struct Example {
int id;
IntList data;
};
struct Result {
int id;
IntList data;
};
typedef TensorListBase<Result> ResultList;
typedef TensorListBase<Example> ExampleList;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -31,8 +31,8 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
//int testxmemid = 0;
//void * recordp = NULL;
/*
for managing the memories
......@@ -1482,7 +1482,7 @@ void XMem::ShowMemUsage(FILE * file)
}
fprintf(file, "mem:%.1fMB used:%.1fMB usage:%.3f\n",
(DTYPE)total/MILLION, (DTYPE)used/MILLION, (DTYPE)used/total);
}
#ifdef USE_CUDA
......@@ -1562,9 +1562,9 @@ void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 128;
}
}
}
......
......@@ -480,12 +480,10 @@ public:
/* managing the memories */
extern XMemManager GMems;
extern XMem * GMem;
//extern int testxmemid;
//extern void * recordp;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -105,6 +105,8 @@ const char * GetOPName(int type)
return "R_REDUCEMEAN";
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if (type == REDUCE_REDUCESUMALL)
return "R_REDUCESUMALL";
else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if (type == REDUCE_REDUCEVARIANCE)
......@@ -135,6 +137,8 @@ const char * GetOPName(int type)
return "S_SPLIT";
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_STACK)
return "S_SHAPE_STACK";
else if (type == SHAPE_SQUEEZE)
return "S_SQUEEZE";
else if (type == SHAPE_TRANSPOSE)
......
......@@ -51,7 +51,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MAX MATH_MATRIXMULBATCHED + 1
#define MATH_MIN MATH_MAX + 1
#define MATH_MULTIPLY MATH_MIN + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
#define MATH_MULTIPLYBROADCAST MATH_MULTIPLYDIM + 1
#define MATH_NEGATE MATH_MULTIPLYBROADCAST + 1
......@@ -74,7 +76,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMALL REDUCE_REDUCESUM + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUMALL + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
/* data and shape related operations */
......@@ -97,7 +100,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define SHAPE_RESHAPE SHAPE_PERMUTE + 1
#define SHAPE_SPLIT SHAPE_RESHAPE + 1
#define SHAPE_SPLIT_LIST SHAPE_SPLIT + 1
#define SHAPE_STACK SHAPE_SPLIT_LIST + 1
#define SHAPE_SQUEEZE SHAPE_STACK + 1
#define SHAPE_TRANSPOSE SHAPE_SQUEEZE + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
......
......@@ -83,13 +83,17 @@
#include "shape/Permute.h"
#include "shape/Split.h"
#include "shape/Squeeze.h"
#include "shape/Stack.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
#include "shape/IsSameShaped.h"
#include "sort/Sort.h"
#include "sort/TopK.h"
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "utilities/CheckData.h"
#include "utilities/SetAscendingOrder.h"
#endif // __CHEADER_H__
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
......@@ -48,9 +49,6 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
......@@ -63,17 +61,17 @@ void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int le
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] && a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......@@ -168,7 +166,7 @@ int GetDivDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -253,8 +251,8 @@ where i is the index of the item
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetDivDimIndex(a, b);
......
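/* A minimal usage sketch (illustrative only; alpha = 0 is assumed to leave
   c as the plain element-wise quotient): */
static void DivDemo()
{
    using namespace nts;
    XTensor a;
    XTensor b;
    XTensor c;
    InitTensor2D(&a, 2, 3);
    InitTensor2D(&b, 2, 3);
    a.SetDataRand(1.0F, 2.0F);    /* keep the divisor away from zero */
    b.SetDataRand(1.0F, 2.0F);
    Div(a, b, c, 0.0F, 0);        /* c_i = a_i / b_i along leading dimension 0 */
}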
......@@ -122,7 +122,6 @@ where i is the item index
*/
void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
......@@ -130,18 +129,18 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -26,6 +26,7 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -56,7 +57,7 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckDev(a->devID, b->devID);
if(_IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
}
......@@ -188,8 +189,8 @@ i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Div function */
......
......@@ -24,6 +24,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Mask.h"
#include "Mask.cuh"
......@@ -171,8 +172,8 @@ where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Mask function */
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "MatrixMulBatched.h"
#include "XTensorBLAS.h"
#include "MatrixMul2D.h"
......@@ -94,27 +95,27 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int devIDBackup = 0;
......@@ -125,9 +126,9 @@ void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
a->data, transposedA, a->dataType, aBlockSize,
b->data, transposedB, b->dataType, bBlockSize,
c->data, c->dataType, cBlockSize, blockNum,
a->dimSize[a->order - 2], a->dimSize[a->order - 1],
b->dimSize[b->order - 2], b->dimSize[b->order - 1],
c->dimSize[c->order - 2], c->dimSize[c->order - 1], alpha, beta);
BacktoCudaDev(a->devID, devIDBackup);
#endif
......@@ -163,46 +164,43 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSize[a->order - 1] : a->dimSize[a->order - 2];
int am = transposedA == X_TRANS ? a->dimSize[a->order - 2] : a->dimSize[a->order - 1];
int bn = transposedB == X_TRANS ? b->dimSize[b->order - 1] : b->dimSize[b->order - 2];
int bm = transposedB == X_TRANS ? b->dimSize[b->order - 2] : b->dimSize[b->order - 1];
int cn = c->dimSize[c->order - 2];
int cm = c->dimSize[c->order - 1];
CheckNTErrors(am == bn && an == cn && bm == cm, "Unmatched tensors in multiplication!");
int aBlockSize = a->dimSize[a->order - 1] * a->dimSize[a->order - 2];
int bBlockSize = b->dimSize[b->order - 1] * b->dimSize[b->order - 2];
int cBlockSize = c->dimSize[c->order - 1] * c->dimSize[c->order - 2];
int aRealBlockSize = aBlockSize * a->unitSize;
int bRealBlockSize = bBlockSize * b->unitSize;
int cRealBlockSize = cBlockSize * c->unitSize;
int blockNum = 1;
for (int i = 0; i < a->order - 2; i++) {
CheckNTErrors((a->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
CheckNTErrors((b->dimSize[i] == c->dimSize[i]), "Incorrect tensor sizes!");
blockNum *= a->dimSize[i];
}
int aDimSize[2] = {-a->dimSize[a->order - 2], a->dimSize[a->order - 1]};
int bDimSize[2] = {-b->dimSize[b->order - 2], b->dimSize[b->order - 1]};
int cDimSize[2] = {-c->dimSize[c->order - 2], c->dimSize[c->order - 1]};
XTensor * ai = NewTensor2DV2(aDimSize[0], aDimSize[1], a->dataType, a->devID, a->mem);
XTensor * bi = NewTensor2DV2(bDimSize[0], bDimSize[1], b->dataType, b->devID, b->mem);
XTensor * ci = NewTensor2DV2(cDimSize[0], cDimSize[1], c->dataType, c->devID, c->mem);
for (int i = 0; i < blockNum; i++) {
ai->data = (char*)a->data + i * aRealBlockSize;
bi->data = (char*)b->data + i * bRealBlockSize;
ci->data = (char*)c->data + i * cRealBlockSize;
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
......@@ -245,9 +243,9 @@ void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!_IsSameShaped(aim, ai) ||
!_IsSameShaped(bim, bi) ||
!_IsSameShaped(cim, ci))
{
isUniform = false;
break;
......@@ -262,10 +260,7 @@ void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((bi->order == 2), "2d tensor (i.e., matrix) is required!");
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
......@@ -297,10 +292,10 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSize[a.order - 1] : a.dimSize[a.order - 2];
int am = transposedA == X_TRANS ? a.dimSize[a.order - 2] : a.dimSize[a.order - 1];
int bn = transposedB == X_TRANS ? b.dimSize[b.order - 1] : b.dimSize[b.order - 2];
int bm = transposedB == X_TRANS ? b.dimSize[b.order - 2] : b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......@@ -355,10 +350,10 @@ XTensor MatrixMulBatched(const XTensor &a, const XTensor &b,
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = a.dimSize[a.order - 2];
int am = a.dimSize[a.order - 1];
int bn = b.dimSize[b.order - 2];
int bm = b.dimSize[b.order - 1];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
......
......@@ -37,7 +37,7 @@ int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -71,26 +71,27 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = x.dimSize[x.order - 2];
int xm = x.dimSize[x.order - 1];
int wn = w.dimSize[w.order - 2];
int wm = w.dimSize[w.order - 1];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
......@@ -148,24 +149,24 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = transposedA == X_TRANS ? x.dimSize[x.order - 1] : x.dimSize[x.order - 2];
int xm = transposedA == X_TRANS ? x.dimSize[x.order - 2] : x.dimSize[x.order - 1];
int wn = transposedB == X_TRANS ? w.dimSize[w.order - 1] : w.dimSize[w.order - 2];
int wm = transposedB == X_TRANS ? w.dimSize[w.order - 2] : w.dimSize[w.order - 1];
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 0; i < x.order - 2; i++)
dimSize[sub++] = x.dimSize[i];
for (int i = 0; i < w.order - 2; i++)
dimSize[sub++] = w.dimSize[i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBufV2(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, transposedA, &w, transposedB, tmp, alpha, 0, parallelRunner);
......@@ -205,7 +206,6 @@ XTensor MulAndShift(const XTensor& x, MATRIX_TRANS_TYPE transposedA,
DelTensorBuf(tmp);
return c;
}
}
\ No newline at end of file
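/* A minimal usage sketch (illustrative only; the shapes and the default
   extra arguments are assumptions): a linear layer y = x * w + b. */
static void MulAndShiftDemo()
{
    using namespace nts;
    XTensor x;
    XTensor w;
    XTensor b;
    InitTensor2D(&x, 8, 16);     /* batch x input */
    InitTensor2D(&w, 16, 32);    /* input x output */
    InitTensor1D(&b, 32);
    x.SetDataRand();
    w.SetDataRand();
    b.SetDataRand();
    XTensor y = MulAndShift(x, w, b);    /* y has shape 8 x 32 */
}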
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
......@@ -48,9 +49,6 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
......@@ -63,18 +61,18 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
int blockSizeB = 1;
int blockSizeC = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......@@ -169,7 +167,7 @@ int GetMultiplyDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -254,8 +252,8 @@ where i is the index of the item
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
......
......@@ -122,26 +122,25 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
int blockNum = 1;
int dimensionSizeA = a->dimSize[leadingDim];
int dimensionSizeB = b->dimSize[leadingDim];
int dimensionSizeC = c->dimSize[leadingDim];
for (int i = 0; i < a->order; i++) {
if (i != leadingDim) {
CheckNTErrors((a->dimSize[i] == b->dimSize[i] &&
a->dimSize[i] == c->dimSize[i]),
"Unmatched tensors!");
}
if (i > leadingDim)
stride *= a->dimSize[i];
}
blockSizeA = stride * dimensionSizeA;
......
......@@ -24,6 +24,7 @@
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
......@@ -57,7 +58,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckDev(a->devID, b->devID);
if(_IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
}
......@@ -203,8 +204,8 @@ i.e., a is multiplied with b by broadcasting
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Multiply function */
......@@ -232,7 +233,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order;
int count = 0;
......@@ -280,8 +281,8 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensorV2(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensorV2(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
......@@ -371,8 +372,8 @@ where some of dimensions of b can be of size 1
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumBroadcast function */
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../shape/IsSameShaped.h"
#include "Sub.h"
#include "Sub.cuh"
#include "SubDim.h"
......@@ -149,7 +150,7 @@ int GetSubDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
......@@ -223,8 +224,8 @@ tensor subtraction c = a - b * \beta
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSubDimIndex(a, b);
......
......@@ -26,6 +26,7 @@
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -61,7 +62,7 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
if (_IsSameShaped(a, b)) {
_Sub(a, b, c, beta);
return;
}
......@@ -188,8 +189,8 @@ i.e., a is subtracted with b by broadcasting
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _Sub function */
......
......@@ -39,7 +39,7 @@ where a is a tensor and b is a row vector
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -75,7 +75,7 @@ where a is a tensor and b is a colum vector
*/
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
......
......@@ -22,7 +22,9 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../../XBLAS.h"
#include "../movement/CopyValues.h"
#include "../shape/IsSameShaped.h"
#include "Sum.h"
#include "Sum.cuh"
#include "SumDim.h"
......@@ -45,6 +47,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
......@@ -157,6 +161,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
tensor summation a = a + b * \beta (do it on site)
keep the result in the tensor a and return nothing
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
*/
void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
{
_Sum(&a, &b, &a, beta);
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
......@@ -165,6 +182,8 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
{
if(a.order < b.order)
return -1;
if(IsSameShaped(a, b))
return -1;
int hitCount = 0;
int hitDim = -1;
......@@ -184,7 +203,7 @@ int GetSumDimIndex(const XTensor &a, const XTensor &b)
}
/*
tensor summation c = a + b * \beta (return an XTensor structure)
make a new tensor c to keep the result and return it
>> a - a tensor
......@@ -227,4 +246,45 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
/* tensor connections */
if (a.enableGrad && b.enableGrad) {
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -45,15 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta;
}
__global__
void KernelADD(int * a, int * b, int * c, int size, int beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * beta;
}
/*
tensor summation c = a + b * \beta (cuda version)
>> a - a tensor
......@@ -109,17 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelADD << <blocks, threads >> >((int*)a->data, (int*)b->data, (int*)c->data, a->unitNum, (int)beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing
*/
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
/*
tensor summation c = a + b * \beta
......@@ -41,6 +42,9 @@ make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__
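/* A minimal usage sketch (illustrative only) of the three summation interfaces: */
static void SumDemo()
{
    using namespace nts;
    XTensor a;
    XTensor b;
    XTensor c;
    InitTensor2D(&a, 2, 2);
    InitTensor2D(&b, 2, 2);
    a.SetDataRand();
    b.SetDataRand();
    XTensor d = Sum(a, b);    /* d = a + b, returned as a new tensor */
    Sum(a, b, c, 0.5F);       /* c = a + 0.5 * b, written into c */
    SumMe(a, b);              /* a = a + b, in place */
}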
......@@ -26,6 +26,7 @@
#include "SumDim.h"
#include "SumDim.cuh"
#include "../shape/Unsqueeze.h"
#include "../shape/IsSameShaped.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
......@@ -64,25 +65,11 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
if(XTensor::IsSameShaped(a, b)){
if(_IsSameShaped(a, b)){
_Sum(a, b, c, beta);
return;
}
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta);
......@@ -205,8 +192,8 @@ i.e., a is summed with b by broadcasting
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumDim function */
......@@ -233,7 +220,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
{
CheckNTErrors(a->order == b->order, "Wrong tensor orders!");
CheckNTErrors(a->order == c->order, "Wrong tensor orders!");
CheckNTErrors(a->order > 0, "TODO!");
CheckNTErrors(a->order >= 0, "TODO!");
int order = a->order;
int count = 0;
......@@ -281,8 +268,8 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
dimsS[0] = -dimsS[0];
dimsT[0] = -dimsT[0];
XTensor * s = NewTensor(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensor(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
XTensor * s = NewTensorV2(order - (j - i), dimsS, a->dataType, a->denseRatio, a->devID, a->mem);
XTensor * t = NewTensorV2(order - (j - i) + 1, dimsT, b->dataType, b->denseRatio, b->devID, b->mem);
if(count == 0)
source = b->data;
......@@ -374,8 +361,8 @@ c = a + b * \beta
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
if (!c.isInit || !IsSameShaped(a, c)) {
InitTensorV2(&c, &a);
}
/* call _SumBroadcast function */
......
......@@ -87,17 +87,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
if(row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
if(threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
if(betaFired)
c[offset] = a[offset] + bv[threadIdx.y] * beta;
else
c[offset] = a[offset] + bv[threadIdx.y];
......
......@@ -22,6 +22,7 @@
#include "../../XUtility.h"
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../shape/IsSameShaped.h"
#include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -224,9 +225,9 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * ai = (XTensor*)a->GetItem(i);
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
if (!XTensor::IsSameShaped(aim, ai) ||
!XTensor::IsSameShaped(bim, bi) ||
!XTensor::IsSameShaped(cim, ci))
if (!_IsSameShaped(aim, ai) ||
!_IsSameShaped(bim, bi) ||
!_IsSameShaped(cim, ci))
{
isUniform = false;
break;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XName.h"
......@@ -131,7 +131,7 @@ void ConvertDataType(const XTensor & input, XTensor & output, TENSOR_DATA_TYPE d
{
if (!output.isInit || input.dataType != output.dataType) {
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
InitTensorV2(&output, input.order, input.dimSize, dataType, dr, input.devID, input.mem);
}
_ConvertDataType(&input, &output);
......
......@@ -21,6 +21,7 @@
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -31,7 +32,7 @@ convert onehot tensor to index tensor
>> index - index tensor, whose entries are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
......@@ -78,13 +79,13 @@ make a new tensor to keep the result and return it
>> size - the last dimension size of the onehot tensor
<< return - the index tensor
*/
XTensor OnehotToIndex(XTensor & onehot, int size)
XTensor OnehotToIndex(const XTensor & onehot, int size)
{
CheckNTErrors(onehot.GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot.dataType == X_INT, "The onehot tensor must be in X_INT!")
XTensor index;
InitTensor(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
InitTensorV2(&index, onehot.order - 1, onehot.dimSize, X_INT, 1.0F, onehot.devID, onehot.mem);
index.SetTMPFlag();
_OnehotToIndex(&onehot, &index, size);
......@@ -99,7 +100,8 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, whose entries are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
void _IndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float labelSmoothingP)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
......@@ -109,11 +111,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
for (int i = 0; i < index->order; i++)
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
onehot->SetZeroAll();
//onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return;
......@@ -129,8 +134,49 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
DTYPE * od = onehotData + i * stride;
od[id] = 1;
od[id] = confidence;
}
}
/*
convert an index array to an onehot tensor
>> index - index array, whose entries are integer indices
>> n - number of indices in the array
>> onehot - onehot tensor, whose entries are 0 or 1 (smoothed if labelSmoothingP > 0)
>> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the label smoothing factor
*/
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP)
{
/*CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
onehot->SetZeroAll();
#ifdef USE_CUDA
if (onehot->devID >= 0) {
delete[] cudaIndex;
return;
}
#endif
int blockNum = n;
int stride = size;
int * indexData = (int *)index;
int * onehotData = (int *)onehot->data;
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
int * od = onehotData + i * stride;
od[id] = 1;
}*/
XTensor* cudaIndex = NewTensor1DV2(n, X_INT, onehot->devID);
cudaIndex->SetData(index, n);
_IndexToOnehot(cudaIndex, onehot, size, labelSmoothingP);
delete cudaIndex;
}
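/*
worked example for the smoothing constants used above (illustration only):
with size = 4 and labelSmoothingP = 0.1,
    confidence    = 1 - 0.1 = 0.9
    lowconfidence = 0.1 / 4 = 0.025
so the index 2 becomes the row [0.025, 0.025, 0.9, 0.025], and each row sums
to confidence + (size - 1) * lowconfidence = 1 - labelSmoothingP / size = 0.975
*/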
......@@ -143,7 +189,7 @@ make a new tensor to keep the result and return it
>> labelSmoothingP - the label smoothing factor
<< return - the onehot tensor
*/
XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
XTensor IndexToOnehot(const XTensor & index, int size, float labelSmoothingP)
{
CheckNTErrors(index.dataType == X_INT, "The index tensor must be in X_INT!")
......@@ -154,7 +200,7 @@ XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
InitTensorV2(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size, labelSmoothingP);
......
......@@ -61,7 +61,7 @@ convert onehot tensor to index tensor (cuda version)
>> index - index tensor, whose entries are integer indices
>> size - the last dimension size of the onehot tensor
*/
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size)
void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size)
{
int devID = onehot->devID;
......@@ -111,13 +111,10 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
int id = indexData[i];
//od[id] = 2.0;
//onehotData[i * stride + id] = 0.1;
if (offset == id)
od[offset] = confidence;
else{
od[offset] = lowconfidence;
}
//else
// od[offset] = lowconfidence;
}
/*
......@@ -127,7 +124,8 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, whose entries are 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float confidence, float lowconfidence)
{
int devID = onehot->devID;
......
......@@ -27,10 +27,11 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor (cuda version) */
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
void _CudaOnehotToIndex(const XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot,
int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,18 +27,21 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
void _OnehotToIndex(const XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num);
XTensor OnehotToIndex(const XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(int * index, int n, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
XTensor IndexToOnehot(const XTensor & index, int num, float labelSmoothingP);
} // namespace nts(NiuTrans.Tensor)
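/*
illustration only (not part of this commit): building a smoothed gold
distribution from integer labels, assuming a CPU index tensor and an
InitTensor1DV2 initializer; the names are hypothetical

    XTensor label;
    InitTensor1DV2(&label, 3, X_INT, -1);
    int golds[3] = {2, 0, 1};
    label.SetData(golds, 3);
    XTensor gold = IndexToOnehot(label, 4, 0.1F);  // 3 * 4 float tensor:
                                                   // 0.9 at the label, 0.025 elsewhere
*/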
......
......@@ -26,6 +26,114 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a tensor with the data selected by index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the selected indices
>> dim - the dimension along which we select the data
*/
void _Select(const XTensor * a, XTensor * c, int* index, int dim)
{
CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must be of the same order!");
CheckNTErrors(dim >= 0 && dim < a->order, "The input dimension is out of bounds!");
CheckNTErrors(a->dataType == c->dataType, "The tensors must be of the same data type!");
/* number of elements in one slice below the selected dimension */
int stride = 1;
for (int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
/* number of blocks above the selected dimension */
int copyTimes = 1;
for (int i = 0; i < dim; i++)
copyTimes *= a->dimSize[i];
/* number of selected slices, and the copy sizes in bytes */
int cot = c->dimSize[dim];
int blockSize = stride * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
char * s = (char*)a->data;
char * t = (char*)c->data;
for (int i = 0; i < copyTimes; i++) {
for (int j = 0; j < cot; ++j) {
XMemCopy(t + j * blockSize, c->devID, s + index[j] * blockSize, a->devID, blockSize);
}
s += stepSizeS;
t += stepSizeT;
}
}
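/*
worked example for the copy geometry above (illustration only): for a tensor
a of shape (2, 4, 3) with dim = 1 and index = {0, 2}, c has shape (2, 2, 3) and
    stride    = 3                               (elements below dim)
    copyTimes = 2                               (blocks above dim)
    blockSize = 3 * unitSize                    (bytes of one slice)
    stepSizeS = 4 * 3 * unitSize = 12 * unitSize
    stepSizeT = 2 * 3 * unitSize =  6 * unitSize
each of the 2 outer blocks copies the 2 selected slices, then both pointers
advance by one full block
*/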
/*
generate a tensor with the data selected by index along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> index - the selected indices (an int tensor, on CPU or GPU)
>> dim - the dimension along which we select the data
*/
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim)
{
if (index->devID >= 0) {
/* the index tensor lives on a device, so copy it to the host first */
int* indexCPU = new int[index->unitNum];
XMemCopy(indexCPU, -1, index->data, index->devID, index->unitNum * sizeof(int));
_Select(a, c, indexCPU, dim);
delete[] indexCPU;
}
else {
_Select(a, c, (int*)index->data, dim);
}
}
/*
c = select(a)
>> a - input tensor
>> index - the selected indices
>> dim - the dimension along which we select the data
<< return - the generated tensor with the selected data
*/
XTensor Select(const XTensor &a, XTensor &index, int dim)
{
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
for (int i = 0; i < a.order; i++) {
if (i == dim) {
dimSize[i] = index.dimSize[0];
}
else
dimSize[i] = a.dimSize[i];
}
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMPFlag();
/* call _Select function */
_Select(&a, &c, &index, dim);
/* tensor connection */
if (a.enableGrad) {
XLink::MakeLink(&a, &index, &c, GETANDSET_SELECT);
XLink::AddParamToHeadInt(&c, dim);
}
/* destroy variables */
delete[] dimSize;
return c;
}
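/*
illustration only (not part of this commit): selecting two rows of a matrix
with the new Select; the index tensor holds the chosen positions along dim 0

    XTensor a, index;
    InitTensor2DV2(&a, 4, 5, X_FLOAT, -1);
    a.SetDataRand(-1.0F, 1.0F);
    InitTensor1DV2(&index, 2, X_INT, -1);
    int rows[2] = {0, 2};
    index.SetData(rows, 2);
    XTensor c = Select(a, index, 0);    // c has shape (2, 5)
*/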
/*
generate a tensor with selected data in range [low, high] along the given dimension
c = select(a)
......@@ -58,13 +166,12 @@ void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
}
int stride = 1;
int dimRDI = a->order - dim - 1;
for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i];
for(int i = dim + 1; i < a->order; i++)
stride *= a->dimSize[i];
int copyTimes = 1;
for (int i = dimRDI + 1; i < a->order; i++)
copyTimes *= a->dimSizeRDI[i];
for (int i = 0; i < dim; i++)
copyTimes *= a->dimSize[i];
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
......
......@@ -27,13 +27,16 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
void _Select(const XTensor * a, XTensor * c, int* index, int dim);
/* generate a tensor with selected data c = select(a) */
void _Select(const XTensor * a, XTensor * c, XTensor* index, int dim);
/*
generate a tensor with selected data c = select(a) (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
XTensor Select(const XTensor &a, XTensor &index, int dim);
/*
generate a tensor with selected data in range [low, high] along the given dimension
......
......@@ -470,7 +470,7 @@ void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift)
void _SetDataRand(XTensor * tensor, int rNum, int cNum)
{
if (tensor == NULL || tensor->isInit == false || tensor->order != 2) {
InitTensor2D(tensor, rNum, cNum);
InitTensor2DV2(tensor, rNum, cNum);
}
_SetDataRand(tensor, 0.0F, 1.0F);
......@@ -519,7 +519,7 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
#ifdef USE_CUDA
_CudaSetDataRand(tensor, lower, upper);
#endif
//XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//XTensor * t2 = NewTensorV2(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
//_SetDataRand(t2, low, high);
//_CopyValues(t2, tensor);
//delete t2;
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -77,7 +78,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -112,7 +113,7 @@ void _funcName(const XTensor * a, XTensor * b, T num)
if (a->devID >= 0) { \
ShowNTErrors("No GPU devices support!") \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
if (a->dataType == X_INT) { \
int * d = (int*)a->data; \
......@@ -169,8 +170,8 @@ XTensor funcName(const XTensor &a, T num)
_funcName(&a, &b, num); \
if(a.enableGrad){ \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
XLink::AddParamToHead(&b, num); \
} \
return b; \
} \
template XTensor funcName<int>(const XTensor&, int); \
......@@ -181,8 +182,8 @@ template XTensor funcName<double>(const XTensor&, double);
template<class T> \
void funcName(const XTensor &a, XTensor &b, T num) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
if (!b.isInit || !IsSameShaped(a, b)) { \
InitTensorV2(&b, &a); \
} \
_funcName(&a, &b, num); \
if (a.enableGrad) { \
......
......@@ -23,6 +23,7 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Binary.h"
#include "Binary.cuh"
......@@ -89,7 +90,7 @@ void Kernel##funcName(T1 * a, T1 * b, int size, T2 num)
template<class T> \
void _Cuda##funcName(const XTensor * a, XTensor * b, T num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
CheckNTErrors((_IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
......
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../shape/IsSameShaped.h"
#include "Clip.h"
#include "Clip.cuh"
......@@ -43,7 +44,7 @@ void _Clip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
if (a->dataType == DEFAULT_DTYPE) {
DTYPE* d = (DTYPE*)a->data;
......@@ -84,7 +85,19 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper)
}
/*
set every entry to its clip value (return a XTensor structure)
set every entry to its clip value (in-place)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/
void ClipMe(XTensor& a, DTYPE lower, DTYPE upper)
{
_Clip(&a, &a, lower, upper);
}
/*
set every entry to its clip value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
>> lower - the lower border
......@@ -109,6 +122,23 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
return b;
}
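/*
set every entry to its clip value (with an output tensor)
>> a - input tensor we are processing
>> b - output tensor we are processing
>> lower - the lower border
>> upper - the upper border
*/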
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper)
{
if (!b.isInit || !IsSameShaped(a, b)) {
InitTensorV2(&b, &a);
}
/* call _Clip function */
_Clip(&a, &b, lower, upper);
/* tensor connections */
if (a.enableGrad) {
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
}
}
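/*
illustration only (not part of this commit): the three clip entry points on a
CPU tensor; the names are hypothetical

    XTensor t, r;
    InitTensor2DV2(&t, 2, 3, X_FLOAT, -1);
    t.SetDataRand(-5.0F, 5.0F);
    XTensor u = Clip(t, -1.0F, 1.0F);    // new tensor holding the result
    Clip(t, r, -1.0F, 1.0F);             // result written into r
    ClipMe(t, -1.0F, 1.0F);              // clipped in place
*/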
/*
backward computation
......
......@@ -17,11 +17,11 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (linye2015@outlook.com) 2019-07-06 float16/int added
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../shape/IsSameShaped.h"
#include "Clip.h"
#include "Clip.cuh"
......@@ -36,9 +36,8 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -53,6 +52,21 @@ void KernelClip(T * a, T * b, T lower, T upper, int size)
}
/*
set each entry to its clip value (CUDA Kernel, float16 version)
note: this kernel is currently a placeholder and performs no clipping
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
/* TODO: float16 clipping is not implemented yet */
return;
}
/*
set each entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
......@@ -61,7 +75,7 @@ set each entry to its clip value
*/
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((_IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -78,11 +92,8 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
if (a->dataType == DEFAULT_DTYPE) {
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
else if (a->dataType == X_FLOAT16) {
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......